Пример #1
0
def test_too_many_neighbors_warns():
    """Fitting with n_neighbors far larger than the data should still honor a/b."""
    model = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    model.fit(nn_data[:100,])
    # The explicitly supplied curve parameters must be kept verbatim.
    assert_equal(model._a, 1.2)
    assert_equal(model._b, 1.75)
Пример #2
0
def test_bad_output_metric():
    """Unsupported output metrics must raise ValueError at fit time."""
    for bad_metric in ("foobar", "precomputed", "hamming"):
        model = UMAP(output_metric=bad_metric)
        assert_raises(ValueError, model.fit, nn_data)
Пример #3
0
def test_umap_transform_on_iris():
    """Transforming held-out iris samples should remain trustworthy."""
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)

    held_out = iris.data[~iris_selection]
    projected = model.transform(held_out)

    score = trustworthiness(held_out, projected, 10)
    assert score >= 0.89, (
        "Insufficiently trustworthy transform for iris dataset: {}".format(score)
    )
Пример #4
0
def test_umap_transform_on_iris_modified_dtype():
    """Transform should still work after casting the embedding to float64."""
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    # Cast the learned embedding to a different dtype to exercise
    # transform()'s dtype handling.
    fitter.embedding_ = fitter.embedding_.astype(np.float64)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.89,
        # Fixed: adjacent literals previously concatenated to "foriris".
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )
def metrics(model, data_iterator):
    """Segment ads in each audio file and collect per-file statistics.

    For every (data, labels) pair yielded by ``data_iterator``, the audio is
    embedded, projected to 2-D with UMAP, clustered with HDBSCAN, and the
    clusters are segmented into ads.  Diagnostic plots are saved alongside
    the segmented ads.

    Args:
        model: unused in the visible body -- NOTE(review): confirm whether it
            was meant to replace the module-level ``encoder``.
        data_iterator: iterable of (audio_path, labels) pairs; ``labels`` is
            presumably the true ad annotation per file -- TODO confirm.

    Returns:
        None.  NOTE(review): the accumulated lists (ads_pred, ads_actual,
        total_duration, pred_ads_duration) are built but never returned in
        the visible code -- confirm whether a return statement is missing.
    """
    umap_proj = UMAP(metric='euclidean', n_neighbors=200, low_memory=True)
    hdb_clusterer = hdbscan.HDBSCAN(
        min_samples=100,
        min_cluster_size=100,
    )
    # Per-file accumulators (currently write-only; see docstring note).
    ads_pred = []
    ads_actual = []
    total_duration = []
    pred_ads_duration = []
    # NOTE(review): tqdm wraps enumerate, so no total is displayed; ``i`` is unused.
    for i, (data, labels) in tqdm(enumerate(data_iterator)):
        aud_len = MP3_META(data).info.length
        total_duration.append(aud_len)
        aud_data = load_audio(data)
        embeds, (aud_splits, _) = encoder.embed(aud_data, group=False)
        print(data, "Embed done")
        try:
            projs = umap_proj.fit_transform(embeds)
            print(data, "Created Projections")
        except Exception as e:
            # Best-effort: skip files whose embeddings UMAP cannot project.
            print(e)
            continue
        clusters = hdb_clusterer.fit_predict(projs)
        print(data, "Created Clusters")

        ad_dir, ads = segment_ads(aud_data, aud_splits, data, clusters)
        # Each detected ad segment appears to count as 10 units (seconds?) --
        # TODO confirm the unit.
        pred_ads_duration.append(len(ads) * 10)
        ads_pred.append(len(ads))
        ads_actual.append(labels)
        print(data, "Done segmenting ads")

        # Save a UMAP scatter plot and the cluster-label trace for inspection.
        plt.scatter(projs[:, 0], projs[:, 1], cmap='Spectral')
        plt.title(str(Counter(clusters)))
        plt.savefig('{}/{}_umap.jpg'.format(ad_dir, data.split('/')[-1]))
        plt.close()
        plt.plot(clusters)
        plt.savefig('{}/{}_hdb_labels.jpg'.format(ad_dir, data.split('/')[-1]))
        plt.close()

        continue
Пример #6
0
def test_multi_component_layout():
    """Blobs embedded by UMAP should yield clusters near their true layout."""
    data, labels = datasets.make_blobs(100,
                                       2,
                                       centers=5,
                                       cluster_std=0.5,
                                       center_box=[-20, 20],
                                       random_state=42)

    # L2-normalized centroid of each true cluster.
    true_centroids = np.empty((labels.max() + 1, data.shape[1]),
                              dtype=np.float64)
    for label in range(labels.max() + 1):
        true_centroids[label] = data[labels == label].mean(axis=0)
    true_centroids = normalize(true_centroids, norm="l2")

    embedding = UMAP(n_neighbors=4).fit_transform(data)
    embed_labels = KMeans(n_clusters=5).fit_predict(embedding)

    # Centroids of the ORIGINAL data grouped by the labels recovered from the
    # embedding.  NOTE(review): this averages `data`, not `embedding` --
    # confirm that is the intended comparison.
    embed_centroids = np.empty((labels.max() + 1, data.shape[1]),
                               dtype=np.float64)
    for label in range(embed_labels.max() + 1):
        embed_centroids[label] = data[embed_labels == label].mean(axis=0)
    embed_centroids = normalize(embed_centroids, norm="l2")

    error = np.sum((true_centroids - embed_centroids) ** 2)
    # Fixed typo in the failure message ("to far" -> "too far").
    assert_less(error, 15.0, msg="Multi component embedding too far astray")
Пример #7
0
def test_umap_trustworthiness_on_iris():
    """A seeded iris embedding should score highly on trustworthiness."""
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42
    ).fit_transform(iris.data)
    trust = trustworthiness(iris.data, embedding, 10)
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust)
    )
Пример #8
0
def test_umap_trustworthiness_on_sphere_iris():
    """Haversine-output embedding of iris should be trustworthy on the sphere."""
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        output_metric="haversine",
    ).fit_transform(iris.data)
    # trustworthiness has no haversine support, so map the angular
    # coordinates onto a radius-3 sphere in R^3 and compare with cosine.
    radius = 3
    theta = embedding[:, 0]
    phi = embedding[:, 1]
    projected_embedding = np.vstack(
        [
            radius * np.sin(theta) * np.cos(phi),
            radius * np.sin(theta) * np.sin(phi),
            radius * np.cos(theta),
        ]
    ).T
    trust = trustworthiness(iris.data, projected_embedding, 10, metric="cosine")
    assert_greater_equal(
        trust,
        0.80,
        "Insufficiently trustworthy spherical embedding for iris dataset: {}".format(
            trust
        ),
    )
Пример #9
0
def test_bad_too_large_min_dist():
    """A min_dist larger than the spread must be rejected at fit time."""
    model = UMAP(min_dist=2.0)
    # The a/b curve fit emits a divide-by-zero RuntimeWarning for this
    # configuration; silence it so only the expected ValueError matters.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        assert_raises(ValueError, model.fit, nn_data)
Пример #10
0
def test_umap_transform_embedding_stability():
    """Transforming new data must leave the fitted embedding untouched.

    Regression test for issue #217: transform() used to mutate the stored
    embedding matrix when the new data had the same number of rows as the
    original training data.
    """

    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    frozen = model.embedding_.copy()

    # Same row count as the training data -- the exact trigger from #217.
    probe = np.random.random(train.shape)
    model.transform(probe)

    assert_array_equal(
        frozen,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Reproduce the original report from issue #217.
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    umap = UMAP()
    u1 = umap.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(u1_orig, umap.embedding_)

    umap.transform(b)
    assert_array_equal(u1_orig, umap.embedding_)
Пример #11
0
def test_umap_sparse_trustworthiness():
    """Embedding a sparse input should remain trustworthy."""
    embedding = UMAP(n_neighbors=10).fit_transform(sparse_test_data[:100])
    trust = trustworthiness(sparse_test_data[:100].toarray(), embedding, 10)
    assert_greater_equal(
        trust,
        0.91,
        # Fixed: adjacent literals previously concatenated to "forsparse".
        "Insufficiently trustworthy embedding for sparse test dataset: {}".format(
            trust
        ),
    )
Пример #12
0
def test_supervised_umap_trustworthiness():
    """Supervised embedding of blobs should be highly trustworthy."""
    data, labels = datasets.make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        data, labels
    )
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Fixed: adjacent literals previously concatenated to "forblobs".
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Пример #13
0
def test_semisupervised_umap_trustworthiness_on_iris():
    """Masking a range of targets with -1 should still embed trustworthily."""
    masked_target = iris.target.copy()
    masked_target[25:75] = -1  # -1 marks points treated as unlabeled
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        iris.data, masked_target
    )
    trust = trustworthiness(iris.data, embedding, 10)
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust)
    )
Пример #14
0
def test_supervised_umap_trustworthiness_on_iris():
    """Supervised iris embedding should be highly trustworthy."""
    data = iris.data
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        data, iris.target
    )
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Fixed: adjacent literals previously concatenated to "foriris".
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust),
    )
Пример #15
0
def test_sklearn_digits():
    """A seeded digits embedding must match the stored reference array."""
    digits = datasets.load_digits()
    embedding = UMAP(n_neighbors=5, min_dist=0.01,
                     random_state=42).fit_transform(digits.data)
    reference_path = os.path.join(os.path.dirname(__file__),
                                  'digits_embedding_42.npy')
    to_match = np.load(reference_path)
    assert_array_almost_equal(embedding, to_match, err_msg='Digits embedding '
                                                           'is not consistent '
                                                           'with previous runs')
Пример #16
0
def test_initialized_umap_trustworthiness_on_iris():
    """Initializing from two data columns should still embed iris trustworthily."""
    data = iris.data
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, init=data[:, 2:], n_epochs=200, random_state=42
    ).fit_transform(data)
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        # Fixed: adjacent literals previously concatenated to "foriris".
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust),
    )
Пример #17
0
def test_umap_trustworthiness_random_init():
    """Random initialization should still give a reasonably trustworthy result."""
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42, init="random"
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        # Fixed: adjacent literals previously concatenated to "fornn".
        "Insufficiently trustworthy embedding for nn dataset: {}".format(trust),
    )
Пример #18
0
def test_umap_sparse_transform_on_iris():
    """Transforming held-out sparse iris data should stay trustworthy."""
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit(data)

    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.85,
        # Fixed: adjacent literals previously concatenated to "foriris".
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )
Пример #19
0
def plot(features, labels, classes, path):
    """Project ``features`` to 2-D with UMAP and save a labeled scatter plot.

    Args:
        features: array of samples; flattened to (n_samples, -1) before UMAP.
        labels: per-sample integer labels used to color the points.
        classes: class names for the legend -- presumably one per distinct
            label; verify against callers.
        path: directory (pathlib.Path) where 'umap.png' is written.
    """
    print(features.shape, labels.shape)

    features, labels = shuffle(features, labels)

    print('Plotting UMAP...', end='')
    features = features.reshape(features.shape[0], -1)
    # Fixed seeds keep the projection reproducible between runs.
    embedding = UMAP(n_neighbors=20, min_dist=1, metric='correlation',
                     random_state=1, transform_seed=1).fit_transform(features)
    # Removed an unused ListedColormap local; the scatter uses 'plasma'.
    scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, s=3,
                          alpha=1, cmap='plasma')

    plt.legend(handles=scatter.legend_elements()[0], labels=classes,
               loc='best', ncol=1, fontsize=6)
    plt.savefig(str(path / 'umap.png'), dpi=300)
    plt.close()
    print('done.')
Пример #20
0
def test_umap_trustworthiness_fast_approx():
    """The forced approximation path should still embed trustworthily."""
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        # Fixed: adjacent literals previously concatenated to "fornn".
        "Insufficiently trustworthy embedding for nn dataset: {}".format(trust),
    )
Пример #21
0
def test_discrete_metric_supervised_umap_trustworthiness():
    """Supervised fit with an ordinal target metric should stay trustworthy."""
    data, labels = datasets.make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        # Fixed: adjacent literals previously concatenated to "forblobs".
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Пример #22
0
    def nmf_training(self, nmf_training_params: NMFTrainingParams):
        """Build the SSNMF topic model and UMAP projector, then start training.

        Args:
            nmf_training_params: bundle of hyper-parameters (topic count,
                iteration bounds, tolerance, random seed, and the UMAP
                neighbor/dimension/min_dist settings) supplied by the caller.
        """
        # Create fresh progress-state objects for this training run.
        self.nmf_training_progress = NMFTrainingProgress()
        self.umap_progress = UMAPProgress()

        # Create the SSNMF topic model; progress is reported through the
        # callback passed as the first positional argument.
        self.ss_nmf = SSNMFTopicModel(self.callback_set_nmf_training_progress,
                                      self.text_preprocessing.tfidfWD,
                                      nmf_training_params.topic_num,
                                      min_iter=nmf_training_params.min_iter,
                                      max_iter=nmf_training_params.max_iter,
                                      tol=nmf_training_params.tolerance,
                                      seed=nmf_training_params.random_seed)
        # The string below is dead code: a previously used t-SNE setup kept
        # as a triple-quoted block (no runtime effect).
        """
        self.t_sne = TSNEWithCallback(
            update_progress_callback=self.callback_set_tsne_progress,
            update_progress_each_iter=10,
            n_components=3,
            perplexity=nmf_training_params.perplexity,
            learning_rate=nmf_training_params.learning_rate,
            n_iter=nmf_training_params.tsne_max_iter,
            metric=custom_doc_distance_callback(nmf_training_params.scaling_ratio),
            init="pca",
            random_state=nmf_training_params.random_seed
        )
        """
        # Debug output of the received UMAP parameters
        # ("参数值:" means "parameter values:").
        print("参数值:")
        print(nmf_training_params.n_neighbors)
        print(nmf_training_params.min_dist)
        # UMAP uses a custom document-distance callback scaled by the
        # configured ratio.
        self.umap = UMAP(n_neighbors=nmf_training_params.n_neighbors,
                         n_components=nmf_training_params.dimension,
                         metric=custom_doc_distance_callback(
                             nmf_training_params.scaling_ratio),
                         min_dist=nmf_training_params.min_dist)

        # Kick off NMF and UMAP training.
        self.start_nmf_and_umap()
Пример #23
0
# Feature matrix and target labels from the (externally loaded) dataset.
X = dataset.data
y = dataset.target

# Generate shape graph using KeplerMapper: project onto the first feature,
# then cover the lens with 6 cubes at 20% overlap.
mapper = KeplerMapper(verbose=1)
lens = mapper.fit_transform(X, projection=[0])
graph = mapper.map(lens, X, nr_cubes=6, overlap_perc=0.2)

# Convert the Mapper graph to a DyNeuGraph, carrying the labels along.
dG = DyNeuGraph(G=graph, y=y)

# Define some custom_layouts: the lens itself plus several networkx layouts.
dG.add_custom_layout(lens, name='lens')
dG.add_custom_layout(nx.spring_layout, name='nx.spring')
dG.add_custom_layout(nx.kamada_kawai_layout, name='nx.kamada_kawai')
dG.add_custom_layout(nx.spectral_layout, name='nx.spectral')
dG.add_custom_layout(nx.circular_layout, name='nx.circular')

# Configure some projections; UMAP is seeded with the PCA projection.
pca = PCA(2, random_state=1)
tsne = TSNE(2, init='pca', random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# Add projections as custom_layouts.
dG.add_custom_layout(pca.fit_transform(X), name='PCA')
dG.add_custom_layout(tsne.fit_transform(X), name='TSNE')
dG.add_custom_layout(umap.fit_transform(X, y=None), name='UMAP')

# Visualize the resulting graph.
dG.visualize(static=True, show=True)
Пример #24
0
def test_bad_transform_data():
    """transform must reject points incompatible with the fitted data."""
    model = UMAP().fit([[1, 1, 1, 1]])
    assert_raises(ValueError, model.transform, [[0, 0, 0, 0]])
Пример #25
0
def test_haversine_embed_to_highd():
    """Haversine output is only defined in 2-D, so 3 components must fail."""
    model = UMAP(n_components=3, output_metric="haversine")
    assert_raises(ValueError, model.fit, nn_data)
Пример #26
0
def repeated_points_large_n():
    """With unique=True, duplicates shrink the effective neighbor count."""
    fitted = UMAP(n_neighbors=5, unique=True).fit(repetition_dense)
    assert_equal(fitted._n_neighbors, 3)
Пример #27
0
def repeated_points_small_dense_binary():
    """Identical binary rows must map to a single embedded point."""
    fitted = UMAP(n_neighbors=3, unique=True).fit(binary_repeats)
    unique_embeds = np.unique(fitted.embedding_[0:2], axis=0)
    assert_equal(unique_embeds.shape[0], 1)
Пример #28
0
def repeated_points_large_dense_binary():
    """Duplicates collapse to one point even on the approximation path."""
    fitted = UMAP(
        n_neighbors=3, unique=True, force_approximation_algorithm=True
    ).fit(binary_repeats)
    unique_embeds = np.unique(fitted.embedding_[0:2], axis=0)
    assert_equal(unique_embeds.shape[0], 1)
Пример #29
0
def test_umap_fit_params():
    """fit must reject x/y length mismatches and accept matching ones."""
    # y longer than x by one row.
    model = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    y = np.random.randint(10, size=(257,))
    assert_raises(ValueError, model.fit, x, y)

    # y shorter than x by one row.
    model = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    y = np.random.randint(10, size=(255,))
    assert_raises(ValueError, model.fit, x, y)

    # Empty y against a populated x.
    model = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    assert_raises(ValueError, model.fit, x, [])

    # Matching lengths: supervised fit succeeds and returns the estimator.
    model = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    y = np.random.randint(10, size=(256,))
    res = model.fit(x, y)
    assert isinstance(res, UMAP)

    # Unsupervised fit also succeeds.
    model = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    res = model.fit(x)
    assert isinstance(res, UMAP)
Пример #30
0
def repeated_points_small_sparse_spatial():
    """Duplicate sparse spatial rows must share one embedded location."""
    fitted = UMAP(n_neighbors=3, unique=True).fit(sparse_spatial_data_repeats)
    unique_embeds = np.unique(fitted.embedding_[0:2], axis=0)
    assert_equal(unique_embeds.shape[0], 1)