def test_too_many_neighbors_warns():
    u = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    u.fit(nn_data[:100,])
    assert_equal(u._a, 1.2)
    assert_equal(u._b, 1.75)
def test_bad_output_metric():
    u = UMAP(output_metric="foobar")
    assert_raises(ValueError, u.fit, nn_data)
    u = UMAP(output_metric="precomputed")
    assert_raises(ValueError, u.fit, nn_data)
    u = UMAP(output_metric="hamming")
    assert_raises(ValueError, u.fit, nn_data)
def test_umap_transform_on_iris():
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    assert (
        trust >= 0.89
    ), "Insufficiently trustworthy transform for iris dataset: {}".format(trust)
def test_umap_transform_on_iris_modified_dtype():
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    fitter.embedding_ = fitter.embedding_.astype(np.float64)
    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.89,
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )
def metrics(model, data_iterator):
    """Embed, cluster, and segment ads for each audio file, collecting stats.

    Args:
        model: unused in the body; kept for interface compatibility.
        data_iterator: iterable of (audio_path, actual_ad_count) pairs.

    Returns:
        None. Saves per-file UMAP/HDBSCAN diagnostic plots as a side effect.
    """
    umap_proj = UMAP(metric='euclidean', n_neighbors=200, low_memory=True)
    hdb_clusterer = hdbscan.HDBSCAN(
        min_samples=100,
        min_cluster_size=100,
    )
    ads_pred = []
    ads_actual = []
    total_duration = []
    pred_ads_duration = []
    for i, (data, labels) in tqdm(enumerate(data_iterator)):
        aud_len = MP3_META(data).info.length
        total_duration.append(aud_len)
        aud_data = load_audio(data)
        embeds, (aud_splits, _) = encoder.embed(aud_data, group=False)
        print(data, "Embed done")
        try:
            projs = umap_proj.fit_transform(embeds)
            print(data, "Created Projections")
        except Exception as e:
            print(e)
            continue
        clusters = hdb_clusterer.fit_predict(projs)
        print(data, "Created Clusters")
        ad_dir, ads = segment_ads(aud_data, aud_splits, data, clusters)
        pred_ads_duration.append(len(ads) * 10)
        ads_pred.append(len(ads))
        ads_actual.append(labels)
        print(data, "Done segmenting ads")
        # Diagnostic plots: the 2D UMAP projection and the cluster label trace.
        plt.scatter(projs[:, 0], projs[:, 1], cmap='Spectral')
        plt.title(str(Counter(clusters)))
        plt.savefig('{}/{}_umap.jpg'.format(ad_dir, data.split('/')[-1]))
        plt.close()
        plt.plot(clusters)
        plt.savefig('{}/{}_hdb_labels.jpg'.format(ad_dir, data.split('/')[-1]))
        plt.close()
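# Usage sketch (illustrative only -- the file paths, ad counts, and the
# `model=None` placeholder below are assumptions; `metrics` also relies on
# the module-level `encoder`, `load_audio`, `MP3_META`, and `segment_ads`):
if __name__ == '__main__':
    episodes = [
        ('data/episode_001.mp3', 3),  # (audio path, actual ad count)
        ('data/episode_002.mp3', 1),
    ]
    metrics(model=None, data_iterator=episodes)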
def test_multi_component_layout():
    data, labels = datasets.make_blobs(
        100, 2, centers=5, cluster_std=0.5, center_box=[-20, 20], random_state=42
    )
    true_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64)
    for label in range(labels.max() + 1):
        true_centroids[label] = data[labels == label].mean(axis=0)
    true_centroids = normalize(true_centroids, norm="l2")
    embedding = UMAP(n_neighbors=4).fit_transform(data)
    embed_centroids = np.empty((labels.max() + 1, data.shape[1]), dtype=np.float64)
    embed_labels = KMeans(n_clusters=5).fit_predict(embedding)
    for label in range(embed_labels.max() + 1):
        embed_centroids[label] = data[embed_labels == label].mean(axis=0)
    embed_centroids = normalize(embed_centroids, norm="l2")
    error = np.sum((true_centroids - embed_centroids) ** 2)
    assert_less(error, 15.0, msg="Multi component embedding too far astray")
def test_umap_trustworthiness_on_iris():
    data = iris.data
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        data
    )
    trust = trustworthiness(iris.data, embedding, 10)
    assert (
        trust >= 0.97
    ), "Insufficiently trustworthy embedding for iris dataset: {}".format(trust)
def test_umap_trustworthiness_on_sphere_iris():
    data = iris.data
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=200,
        random_state=42,
        output_metric="haversine",
    ).fit_transform(data)
    # Since trustworthiness doesn't support haversine, project the angular
    # embedding (a, b) onto a sphere of radius r in 3D via
    # (x, y, z) = r * (sin a cos b, sin a sin b, cos a)
    # and use cosine distance there instead.
    r = 3
    projected_embedding = np.vstack(
        [
            r * np.sin(embedding[:, 0]) * np.cos(embedding[:, 1]),
            r * np.sin(embedding[:, 0]) * np.sin(embedding[:, 1]),
            r * np.cos(embedding[:, 0]),
        ]
    ).T
    trust = trustworthiness(iris.data, projected_embedding, 10, metric="cosine")
    assert_greater_equal(
        trust,
        0.80,
        "Insufficiently trustworthy spherical embedding for iris dataset: {}".format(
            trust
        ),
    )
def test_bad_too_large_min_dist():
    u = UMAP(min_dist=2.0)
    # A RuntimeWarning about division by zero in the a, b curve fitting is
    # expected; it is caught and ignored for this test.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        assert_raises(ValueError, u.fit, nn_data)
def test_umap_transform_embedding_stability():
    """Test that transforming data does not alter the learned embeddings.

    Issue #217 describes how using transform to embed new data with a
    trained UMAP transformer causes the fitted embedding matrix to change
    in cases when the new data has the same number of rows as the original
    training data.
    """
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    original_embedding = fitter.embedding_.copy()
    # The important point is that the new data has the same number of rows
    # as the original fit data
    new_data = np.random.random(data.shape)
    embedding = fitter.transform(new_data)
    assert_array_equal(
        original_embedding,
        fitter.embedding_,
        "Transforming new data changed the original embeddings",
    )
    # Example from issue #217
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))
    umap = UMAP()
    u1 = umap.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(u1_orig, umap.embedding_)
    u2 = umap.transform(b)
    assert_array_equal(u1_orig, umap.embedding_)
def test_umap_sparse_trustworthiness():
    embedding = UMAP(n_neighbors=10).fit_transform(sparse_test_data[:100])
    trust = trustworthiness(sparse_test_data[:100].toarray(), embedding, 10)
    assert_greater_equal(
        trust,
        0.91,
        "Insufficiently trustworthy embedding for sparse test dataset: {}".format(
            trust
        ),
    )
def test_supervised_umap_trustworthiness():
    data, labels = datasets.make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        data, labels
    )
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
def test_semisupervised_umap_trustworthiness_on_iris():
    data = iris.data
    target = iris.target.copy()
    # Mask a third of the labels as unlabelled (-1) for semi-supervision
    target[25:75] = -1
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        data, target
    )
    trust = trustworthiness(iris.data, embedding, 10)
    assert (
        trust >= 0.97
    ), "Insufficiently trustworthy embedding for iris dataset: {}".format(trust)
def test_supervised_umap_trustworthiness_on_iris():
    data = iris.data
    embedding = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit_transform(
        data, iris.target
    )
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust),
    )
def test_sklearn_digits():
    digits = datasets.load_digits()
    data = digits.data
    embedding = UMAP(n_neighbors=5, min_dist=0.01, random_state=42).fit_transform(data)
    # np.save('digits_embedding_42.npy', embedding)
    to_match = np.load(
        os.path.join(os.path.dirname(__file__), 'digits_embedding_42.npy')
    )
    assert_array_almost_equal(
        embedding,
        to_match,
        err_msg='Digits embedding is not consistent with previous runs',
    )
def test_initialized_umap_trustworthiness_on_iris():
    data = iris.data
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, init=data[:, 2:], n_epochs=200, random_state=42
    ).fit_transform(data)
    trust = trustworthiness(iris.data, embedding, 10)
    assert_greater_equal(
        trust,
        0.97,
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust),
    )
def test_umap_trustworthiness_random_init():
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10, min_dist=0.01, random_state=42, init="random"
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(trust),
    )
def test_umap_sparse_transform_on_iris():
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit(data)
    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)
    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.85,
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )
def plot(features, labels, classes, path):
    print(features.shape, labels.shape)
    features, labels = shuffle(features, labels)
    print('Plotting UMAP...', end='')
    features = features.reshape(features.shape[0], -1)
    embedding = UMAP(
        n_neighbors=20,
        min_dist=1,
        metric='correlation',
        random_state=1,
        transform_seed=1,
    ).fit_transform(features)
    # One colour per class, drawn from the module-level colour_list.
    colours = ListedColormap(colour_list[:len(classes)])
    scatter = plt.scatter(
        embedding[:, 0], embedding[:, 1], c=labels, s=3, alpha=1, cmap=colours
    )
    plt.legend(
        handles=scatter.legend_elements()[0],
        labels=classes,
        loc='best',
        ncol=1,
        fontsize=6,
    )
    plt.savefig(str(path / 'umap.png'), dpi=300)
    plt.close()
    print('done.')
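# Usage sketch (illustrative only -- the random features, class names, and
# output path below are placeholders; `colour_list` must already be defined
# at module level with at least len(classes) entries):
if __name__ == '__main__':
    from pathlib import Path
    rng = np.random.default_rng(0)
    feats = rng.normal(size=(300, 8, 4))   # plot() flattens trailing dims
    labs = rng.integers(0, 3, size=300)
    plot(feats, labs, classes=['a', 'b', 'c'], path=Path('.'))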
def test_umap_trustworthiness_fast_approx():
    data = nn_data[:50]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit_transform(data)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.75,
        "Insufficiently trustworthy embedding for nn dataset: {}".format(trust),
    )
def test_discrete_metric_supervised_umap_trustworthiness():
    data, labels = datasets.make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
def nmf_training(self, nmf_training_params: NMFTrainingParams):
    # Create the training progress state objects
    self.nmf_training_progress = NMFTrainingProgress()
    self.umap_progress = UMAPProgress()
    # Create the SSNMF and UMAP objects
    self.ss_nmf = SSNMFTopicModel(
        self.callback_set_nmf_training_progress,
        self.text_preprocessing.tfidfWD,
        nmf_training_params.topic_num,
        min_iter=nmf_training_params.min_iter,
        max_iter=nmf_training_params.max_iter,
        tol=nmf_training_params.tolerance,
        seed=nmf_training_params.random_seed,
    )
    """
    self.t_sne = TSNEWithCallback(
        update_progress_callback=self.callback_set_tsne_progress,
        update_progress_each_iter=10,
        n_components=3,
        perplexity=nmf_training_params.perplexity,
        learning_rate=nmf_training_params.learning_rate,
        n_iter=nmf_training_params.tsne_max_iter,
        metric=custom_doc_distance_callback(nmf_training_params.scaling_ratio),
        init="pca",
        random_state=nmf_training_params.random_seed,
    )
    """
    print("Parameter values:")
    print(nmf_training_params.n_neighbors)
    print(nmf_training_params.min_dist)
    self.umap = UMAP(
        n_neighbors=nmf_training_params.n_neighbors,
        n_components=nmf_training_params.dimension,
        metric=custom_doc_distance_callback(nmf_training_params.scaling_ratio),
        min_dist=nmf_training_params.min_dist,
    )
    # Start training
    self.start_nmf_and_umap()
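# A minimal sketch of what `custom_doc_distance_callback` might look like
# (hypothetical -- its real implementation is not shown in this file). UMAP
# accepts a numba-jitted callable d(x, y) -> float as its `metric`, so the
# factory can close over `scaling_ratio` and return such a function:
import numba
import numpy as np

def custom_doc_distance_callback(scaling_ratio):
    @numba.njit()
    def doc_distance(x, y):
        # Assumed form: Euclidean distance scaled by a fixed ratio.
        total = 0.0
        for i in range(x.shape[0]):
            diff = x[i] - y[i]
            total += diff * diff
        return scaling_ratio * np.sqrt(total)
    return doc_distance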
X = dataset.data
y = dataset.target

# Generate shape graph using KeplerMapper
mapper = KeplerMapper(verbose=1)
lens = mapper.fit_transform(X, projection=[0])
graph = mapper.map(lens, X, nr_cubes=6, overlap_perc=0.2)

# Convert to a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Define some custom_layouts
dG.add_custom_layout(lens, name='lens')
dG.add_custom_layout(nx.spring_layout, name='nx.spring')
dG.add_custom_layout(nx.kamada_kawai_layout, name='nx.kamada_kawai')
dG.add_custom_layout(nx.spectral_layout, name='nx.spectral')
dG.add_custom_layout(nx.circular_layout, name='nx.circular')

# Configure some projections
pca = PCA(2, random_state=1)
tsne = TSNE(2, init='pca', random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# Add projections as custom_layouts
dG.add_custom_layout(pca.fit_transform(X), name='PCA')
dG.add_custom_layout(tsne.fit_transform(X), name='TSNE')
dG.add_custom_layout(umap.fit_transform(X, y=None), name='UMAP')

# Visualize
dG.visualize(static=True, show=True)
def test_bad_transform_data():
    u = UMAP().fit([[1, 1, 1, 1]])
    assert_raises(ValueError, u.transform, [[0, 0, 0, 0]])
def test_haversine_embed_to_highd():
    u = UMAP(n_components=3, output_metric="haversine")
    assert_raises(ValueError, u.fit, nn_data)
def repeated_points_large_n():
    # With unique=True the duplicate rows are collapsed, so the effective
    # n_neighbors is reduced to fit the number of unique points.
    model = UMAP(n_neighbors=5, unique=True).fit(repetition_dense)
    assert_equal(model._n_neighbors, 3)
def repeated_points_small_dense_binary():
    model = UMAP(n_neighbors=3, unique=True).fit(binary_repeats)
    # assert_equal(np.unique(binary_repeats[0:2], axis=0).shape[0], 1)
    assert_equal(np.unique(model.embedding_[0:2], axis=0).shape[0], 1)
def repeated_points_large_dense_binary():
    model = UMAP(n_neighbors=3, unique=True, force_approximation_algorithm=True).fit(
        binary_repeats
    )
    assert_equal(np.unique(model.embedding_[0:2], axis=0).shape[0], 1)
def test_umap_fit_params():
    # x and y are required to be the same length
    u = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    y = np.random.randint(10, size=(257,))
    assert_raises(ValueError, u.fit, x, y)
    u = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    y = np.random.randint(10, size=(255,))
    assert_raises(ValueError, u.fit, x, y)
    u = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    assert_raises(ValueError, u.fit, x, [])
    u = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    y = np.random.randint(10, size=(256,))
    res = u.fit(x, y)
    assert isinstance(res, UMAP)
    u = UMAP()
    x = np.random.uniform(0, 1, (256, 10))
    res = u.fit(x)
    assert isinstance(res, UMAP)
def repeated_points_small_sparse_spatial():
    model = UMAP(n_neighbors=3, unique=True).fit(sparse_spatial_data_repeats)
    assert_equal(np.unique(model.embedding_[0:2], axis=0).shape[0], 1)