def test_umap_downcast_fails(input_type, nrows, n_feats):
    """fit() must raise when dtype conversion is disallowed or would corrupt data."""
    X, y = datasets.make_blobs(n_samples=nrows, n_features=n_feats,
                               random_state=0)

    # Double-precision input with downcasting disabled must fail.
    model = cuUMAP(should_downcast=False, verbose=False)
    if input_type == 'dataframe':
        X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
    with pytest.raises(Exception):
        model.fit(X, should_downcast=False, convert_dtype=False)

    # A float64 value beyond float32 range is corrupted by downcast -> must fail.
    X = np.array([[np.finfo(np.float32).max]], dtype=np.float64)
    model = cuUMAP(should_downcast=True)
    if input_type == 'dataframe':
        X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
    with pytest.raises(Exception):
        model.fit(X, convert_dtype=True)
def test_umap_fit_transform_against_fit_and_transform():
    """transform() should reuse the fit embedding only when input hashing is on."""
    blobs, _ = make_blobs(n_samples=500, n_features=20,
                          centers=10, random_state=42)

    # Default: input is not hashed, so transform() recomputes an embedding.
    cuml_model = cuUMAP(verbose=False)
    ft_embedding = cuml_model.fit_transform(blobs, convert_dtype=True)
    same_input_embedding = cuml_model.transform(blobs, convert_dtype=True)
    assert joblib.hash(ft_embedding) != joblib.hash(same_input_embedding)

    # hash_input=True: transforming the training data returns the cached result.
    cuml_model = cuUMAP(hash_input=True, verbose=False)
    ft_embedding = cuml_model.fit_transform(blobs, convert_dtype=True)
    same_input_embedding = cuml_model.transform(blobs, convert_dtype=True)
    assert joblib.hash(ft_embedding) == joblib.hash(same_input_embedding)

    # A different input must not hit the cache.
    diff_input_embedding = cuml_model.transform(blobs[1:], convert_dtype=True)
    assert joblib.hash(ft_embedding) != joblib.hash(diff_input_embedding)
def transform_embed(knn_graph=None):
    # Fit then transform separately, forwarding an optional precomputed KNN
    # graph; `data` and `n_neighbors` come from the enclosing test scope.
    estimator = cuUMAP(random_state=42, init='random',
                       n_neighbors=n_neighbors)
    estimator.fit(data, knn_graph=knn_graph, convert_dtype=True)
    return estimator.transform(data, knn_graph=knn_graph, convert_dtype=True)
def get_embedding(n_components, random_state):
    # Fit on one split and embed the other; `fit_data` / `transform_data`
    # come from the enclosing test scope.
    mapper = cuUMAP(verbose=False, init="random", n_components=n_components,
                    random_state=random_state)
    mapper.fit(fit_data, convert_dtype=True)
    return mapper.transform(transform_data, convert_dtype=True)
def test_umap_trustworthiness_on_iris():
    """Unsupervised embedding of iris must keep trustworthiness >= 0.97."""
    iris = datasets.load_iris()
    embedding = cuUMAP(n_neighbors=10, min_dist=0.01).fit_transform(
        iris.data, convert_dtype=True)
    assert trustworthiness(iris.data, embedding, 10) >= 0.97
def test_umap_transform_on_digits(target_metric):
    """Out-of-sample transform on digits keeps trustworthiness >= 0.96."""
    digits = datasets.load_digits()
    # Reproducible ~75/25 split of the 1797 digits samples.
    selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.75, 0.25])

    fitter = cuUMAP(n_neighbors=15, verbose=logger.level_debug, init="random",
                    n_epochs=0, min_dist=0.01, random_state=42,
                    target_metric=target_metric)
    fitter.fit(digits.data[selection], convert_dtype=True)

    held_out = digits.data[~selection]
    embedding = fitter.transform(held_out, convert_dtype=True)
    trust = trustworthiness(digits.data[~selection], embedding,
                            n_neighbors=15)
    assert trust >= 0.96
def test_umap_fit_transform_trustworthiness_with_consistency_enabled():
    """fit_transform with a fixed random_state stays trustworthy on iris."""
    iris = datasets.load_iris()
    algo = cuUMAP(n_neighbors=10, min_dist=0.01, random_state=42)
    embedding = algo.fit_transform(iris.data, convert_dtype=True)
    assert trustworthiness(iris.data, embedding, 10) >= 0.97
def test_umap_fit_transform_score(nrows, n_feats):
    """cuML UMAP's clustering quality (ARI) should track umap-learn's on blobs."""
    data, labels = make_blobs(n_samples=nrows, n_features=n_feats,
                              centers=10, random_state=42)

    ref_model = umap.UMAP(n_neighbors=10, min_dist=0.1)
    cuml_model = cuUMAP(n_neighbors=10, min_dist=0.01)

    ref_embedding = ref_model.fit_transform(data)
    cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)

    assert not np.isnan(ref_embedding).any()
    assert not np.isnan(cuml_embedding).any()

    # The ARI comparison is skipped for the very large parametrizations.
    if nrows < 500000:
        cuml_score = adjusted_rand_score(
            labels, KMeans(10).fit_predict(cuml_embedding))
        ref_score = adjusted_rand_score(
            labels, KMeans(10).fit_predict(ref_embedding))
        assert array_equal(ref_score, cuml_score, 1e-2, with_sign=True)
def test_umap_fit_transform_trust(name):
    """Trustworthiness of cuML UMAP must land within 0.1 of umap-learn's."""
    if name == 'iris':
        bunch = datasets.load_iris()
        data, labels = bunch.data, bunch.target
    elif name == 'digits':
        bunch = datasets.load_digits(n_class=5)
        data, labels = bunch.data, bunch.target
    elif name == 'wine':
        bunch = datasets.load_wine()
        data, labels = bunch.data, bunch.target
    else:
        data, labels = make_blobs(n_samples=5000, n_features=10,
                                  centers=10, random_state=42)

    ref_model = umap.UMAP(n_neighbors=10, min_dist=0.01)
    cuml_model = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    ref_embedding = ref_model.fit_transform(data)
    cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)

    ref_trust = trustworthiness(data, ref_embedding, 10)
    cuml_trust = trustworthiness(data, cuml_embedding, 10)
    assert array_equal(ref_trust, cuml_trust, 1e-1, with_sign=True)
def test_umap_data_formats(input_type, should_downcast, nrows, n_feats, name):
    """fit_transform's output container type must mirror the input type."""
    dtype = np.float64 if should_downcast else np.float32

    if name == 'digits':
        # use the digits dataset for unit test
        digits = datasets.load_digits(n_class=9)
        X = digits["data"].astype(dtype)
    else:
        X, y = datasets.make_blobs(n_samples=nrows, n_features=n_feats,
                                   random_state=0)

    model = cuUMAP(n_neighbors=3, n_components=2, verbose=False)
    if input_type == 'dataframe':
        # cudf input -> cudf output.
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        embeds = model.fit_transform(X_cudf, convert_dtype=True)
        assert type(embeds) == cudf.DataFrame
    else:
        # numpy input -> numpy output.
        embeds = model.fit_transform(X)
        assert type(embeds) == np.ndarray
def test_supervised_umap_trustworthiness_on_iris():
    """Supervised (labelled) embedding of iris stays trustworthy."""
    iris = datasets.load_iris()
    embedding = cuUMAP(n_neighbors=10, min_dist=0.01,
                       verbose=False).fit_transform(iris.data, iris.target)
    trust = trustworthiness(iris.data, embedding, 10)
    assert trust >= 0.97 - TRUST_TOLERANCE_THRESH
def test_supervised_umap_trustworthiness_on_iris():
    """Seeded supervised embedding of iris keeps trustworthiness >= 0.97."""
    iris = datasets.load_iris()
    model = cuUMAP(n_neighbors=10, random_state=0, min_dist=0.01)
    embedding = model.fit_transform(iris.data, iris.target, convert_dtype=True)
    assert trustworthiness(iris.data, embedding, n_neighbors=10) >= 0.97
def test_blobs_cluster(nrows, n_feats):
    """UMAP embedding of well-separated blobs must cluster perfectly (ARI=1)."""
    data, labels = datasets.make_blobs(n_samples=nrows, n_features=n_feats,
                                       centers=5, random_state=0)
    embedding = cuUMAP(verbose=False).fit_transform(data, convert_dtype=True)

    # Scoring is skipped for the largest parametrizations.
    if nrows < 500000:
        score = adjusted_rand_score(labels, KMeans(5).fit_predict(embedding))
        assert score == 1.0
def compare_exp_decay_params(a=None, b=None, min_dist=0.1, spread=1.0):
    """Check cuML's exponential-decay curve params (a, b) against umap-learn.

    Parameters
    ----------
    a, b : float or None
        Explicit curve parameters; when None, both libraries derive them
        from ``min_dist`` and ``spread``.
    min_dist, spread : float
        Inputs to the curve-parameter derivation.
    """
    cuml_model = cuUMAP(a=a, b=b, min_dist=min_dist, spread=spread)
    state = cuml_model.__getstate__()
    cuml_a, cuml_b = state['a'], state['b']

    skl_model = umap.UMAP(a=a, b=b, min_dist=min_dist, spread=spread)
    skl_model.fit(np.zeros((1, 1)))  # fit() resolves _a/_b in umap-learn
    sklearn_a, sklearn_b = skl_model._a, skl_model._b

    # BUGFIX: the original asserted ``abs(x) - abs(y) < 1e-6``, a one-sided
    # check that passes whenever |x| <= |y| + eps even for wildly different
    # values. Compare the magnitude of the actual difference instead.
    assert abs(cuml_a - sklearn_a) < 1e-6
    assert abs(cuml_b - sklearn_b) < 1e-6
def test_semisupervised_umap_trustworthiness_on_iris():
    """Embedding stays trustworthy when a block of labels is masked with -1."""
    iris = datasets.load_iris()
    partial_labels = iris.target.copy()
    partial_labels[25:75] = -1  # -1 masks these samples as unlabelled

    embedding = cuUMAP(n_neighbors=10, min_dist=0.01,
                       verbose=False).fit_transform(iris.data, partial_labels,
                                                    convert_dtype=True)
    trust = trustworthiness(iris.data, embedding, 10)
    assert trust >= 0.97 - TRUST_TOLERANCE_THRESH
def test_umap_trustworthiness_on_iris():
    """Unsupervised iris embedding; threshold relaxed for the spectral init."""
    iris = datasets.load_iris()
    embedding = cuUMAP(n_neighbors=10, min_dist=0.01,
                       verbose=False).fit_transform(iris.data,
                                                    convert_dtype=True)
    # We are doing a spectral embedding but not a multi-component layout
    # (which is marked experimental). As a result, our score drops by 0.006.
    trust = trustworthiness(iris.data, embedding, 10)
    assert trust >= 0.964 - TRUST_TOLERANCE_THRESH
def test_umap_transform_trustworthiness_with_consistency_enabled():
    """Out-of-sample transform with a fixed random_state stays trustworthy."""
    iris = datasets.load_iris()
    # Reproducible 50/50 split into fit and transform partitions.
    selection = np.random.RandomState(42).choice(
        [True, False], iris.data.shape[0], replace=True, p=[0.5, 0.5])
    fit_data = iris.data[selection]
    transform_data = iris.data[~selection]

    model = cuUMAP(n_neighbors=10, min_dist=0.01, random_state=42,
                   verbose=False)
    model.fit(fit_data, convert_dtype=True)
    embedding = model.transform(transform_data, convert_dtype=True)
    assert trustworthiness(transform_data, embedding, 10) >= 0.92
def test_umap_transform_on_iris():
    """transform() on held-out iris rows keeps trustworthiness >= 0.89."""
    iris = datasets.load_iris()
    # Reproducible ~75/25 split of the 150 iris samples.
    iris_selection = np.random.RandomState(42).choice(
        [True, False], 150, replace=True, p=[0.75, 0.25])

    fitter = cuUMAP(n_neighbors=10, min_dist=0.01, verbose=False)
    fitter.fit(iris.data[iris_selection], convert_dtype=True)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data, convert_dtype=True)
    assert trustworthiness(new_data, embedding, 10) >= 0.89
def test_supervised_umap_trustworthiness_against_umap_learn():
    """Supervised cuML trustworthiness must land within 0.009 of umap-learn."""
    iris = datasets.load_iris()
    cu_embedding = cuUMAP(n_neighbors=10, min_dist=0.01,
                          verbose=False).fit_transform(iris.data, iris.target,
                                                       convert_dtype=True)
    skl_embedding = umap.UMAP(n_neighbors=10, min_dist=0.01,
                              verbose=False).fit_transform(iris.data,
                                                           iris.target)

    cu_trust = trustworthiness(iris.data, cu_embedding, 10)
    skl_trust = trustworthiness(iris.data, skl_embedding, 10)
    assert (skl_trust - 0.009) <= cu_trust <= (skl_trust + 0.009)
def test_umap_transform_on_iris(target_metric):
    """Seeded transform() on held-out iris rows is finite and trustworthy."""
    iris = datasets.load_iris()
    # Reproducible ~75/25 split of the 150 iris samples.
    iris_selection = np.random.RandomState(42).choice(
        [True, False], 150, replace=True, p=[0.75, 0.25])

    fitter = cuUMAP(n_neighbors=10, init="random", n_epochs=800,
                    min_dist=0.01, random_state=42,
                    target_metric=target_metric)
    fitter.fit(iris.data[iris_selection], convert_dtype=True)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data, convert_dtype=True)
    assert not np.isnan(embedding).any()
    assert trustworthiness(new_data, embedding, n_neighbors=10) >= 0.85
def test_umap_transform_on_digits_sparse(target_metric, input_type,
                                         xform_method):
    """Sparse CSR inputs (SciPy or CuPy) embed digits with high trust."""
    digits = datasets.load_digits()
    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.75, 0.25])

    # Build CSR matrices in the requested backend (host or device).
    sp_prefix = cupyx.scipy.sparse if input_type == 'cupy' else scipy.sparse
    data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[digits_selection]))
    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[~digits_selection]))

    fitter = cuUMAP(n_neighbors=15, verbose=logger.level_info, init="random",
                    n_epochs=0, min_dist=0.01, random_state=42,
                    target_metric=target_metric)

    if xform_method == 'fit':
        fitter.fit(data, convert_dtype=True)
        embedding = fitter.transform(new_data, convert_dtype=True)
    else:
        embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if input_type == 'cupy':
        embedding = embedding.get()  # copy device result back to host
    trust = trustworthiness(digits.data[~digits_selection], embedding,
                            n_neighbors=15)
    assert trust >= 0.96
def test_umap_data_formats(input_type, should_downcast, nrows, n_feats, name):
    """fit_transform on a NumPy array must return a NumPy array."""
    dtype = np.float64 if should_downcast else np.float32

    if name == 'digits':
        # use the digits dataset for unit test
        digits = datasets.load_digits(n_class=9)
        X = digits["data"].astype(dtype)
    else:
        X, y = datasets.make_blobs(n_samples=nrows, n_features=n_feats,
                                   random_state=0)

    model = cuUMAP(n_neighbors=3, n_components=2, verbose=False)
    embeds = model.fit_transform(X)
    assert type(embeds) == np.ndarray
def test_umap_fit_transform_score_default():
    """Default-parameter cuML UMAP matches umap-learn's ARI on blobs."""
    data, labels = make_blobs(n_samples=500, n_features=20,
                              centers=10, random_state=42)

    ref_embedding = umap.UMAP().fit_transform(data)
    cuml_embedding = cuUMAP(verbose=False).fit_transform(data,
                                                         convert_dtype=True)

    cuml_score = adjusted_rand_score(
        labels, KMeans(10).fit_predict(cuml_embedding))
    ref_score = adjusted_rand_score(
        labels, KMeans(10).fit_predict(ref_embedding))
    assert array_equal(ref_score, cuml_score, 1e-2, with_sign=True)
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors):
    """cuML's fuzzy simplicial set (graph_) should approximate umap-learn's."""
    X, _ = make_blobs(n_samples=n_rows, centers=30,
                      n_features=n_features, random_state=42)

    cu_model = cuUMAP(n_neighbors=n_neighbors)
    cu_model.fit(X)
    cu_fss_graph = cu_model.graph_.todense()

    ref_model = umap.UMAP(n_neighbors=n_neighbors)
    ref_model.fit(X)
    ref_fss_graph = cp.sparse.coo_matrix(ref_model.graph_).todense()

    # Compare the dense membership matrices entry-wise with loose tolerances.
    assert correctness_sparse(ref_fss_graph, cu_fss_graph,
                              atol=0.1, rtol=0.2, threshold=0.95)
def get_embedding(n_components, random_state):
    # One-shot fit_transform on the shared `data` from the enclosing scope.
    mapper = cuUMAP(init="random", n_components=n_components,
                    random_state=random_state)
    return mapper.fit_transform(data, convert_dtype=True)
def encoder_latent_umaps(workdir, outdir, epochs, n_particles_total, subset,
                         random_seed, use_umap_gpu, random_state,
                         n_epochs_umap, LOG):
    '''
    Calculates UMAP embeddings of subset of particles' selected epochs' latent encodings

    Inputs
        workdir: path to directory containing cryodrgn training results
        outdir: path to base directory to save outputs
        epochs: array of epochs for which to calculate UMAPs
        n_particles_total: int of total number of particles trained
        subset: int, size of subset on which to calculate umap, None means all
        random_seed: int, seed for random selection of subset particles
        use_umap_gpu: bool, whether to use the cuML library to GPU accelerate UMAP calculations (if available in env)
        random_state: int, random state seed used by UMAP for reproducibility at slight cost of performance (None means faster but non-reproducible)
        n_epochs_umap: int, passed as n_epochs to the GPU (cuML) UMAP reducer
        LOG: log destination handed through to flog

    Outputs
        pkl of each UMAP embedding stored in outdir/umaps/umap.epoch.pkl
        png of all UMAPs

    # apparently running multiple UMAP embeddings (i.e. for each epoch's z.pkl) in parallel on CPU requires difficult backend setup
    # see https://github.com/lmcinnes/umap/issues/707
    # therefore not implemented currently
    '''
    # Decide how many particles to embed; the string 'None' (e.g. from CLI
    # parsing) means the full stack.
    if subset == 'None':
        n_particles_subset = n_particles_total
        flog('Using full particle stack for UMAP', LOG)
    else:
        if random_seed is None:
            random_seed = random.randint(0, 100000)
            random.seed(random_seed)
        else:
            random.seed(random_seed)
        n_particles_subset = min(n_particles_total, int(subset))
        flog(f'Randomly selecting {n_particles_subset} particle subset on which to run UMAP (with random seed {random_seed})', LOG)
    # Sampling k == population size yields the full (sorted) index range, so
    # this also covers the "full stack" branch above.
    ind_subset = sorted(random.sample(range(0, n_particles_total),
                                      k=n_particles_subset))
    utils.save_pkl(ind_subset, outdir + '/ind_subset.pkl')

    # Embed each requested epoch's latent encodings and cache the result.
    for epoch in epochs:
        flog(f'Now calculating UMAP for epoch {epoch} with random_state {random_state}', LOG)
        z = utils.load_pkl(workdir + f'/z.{epoch}.pkl')[ind_subset, :]
        if use_umap_gpu:  # using cuML library GPU-accelerated UMAP
            reducer = cuUMAP(random_state=random_state,
                             n_epochs=n_epochs_umap)
            umap_embedding = reducer.fit_transform(z)
        else:  # using umap-learn library CPU-bound UMAP
            reducer = umap.UMAP(random_state=random_state)
            umap_embedding = reducer.fit_transform(z)
        utils.save_pkl(umap_embedding, f'{outdir}/umaps/umap.{epoch}.pkl')

    # Lay out one hexbin panel per epoch on a near-square grid.
    n_cols = int(np.ceil(len(epochs) ** 0.5))
    n_rows = int(np.ceil(len(epochs) / n_cols))
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(2 * n_cols, 2 * n_rows),
                             sharex='all', sharey='all')
    fig.tight_layout()
    for i, ax in enumerate(axes.flat):
        try:
            umap_embedding = utils.load_pkl(
                f'{outdir}/umaps/umap.{epochs[i]}.pkl')
            toplot = ax.hexbin(umap_embedding[:, 0], umap_embedding[:, 1],
                               bins='log', mincnt=1)
            ax.set_title(f'epoch {epochs[i]}')
        except IndexError:
            # The grid may contain more axes than epochs; leave extras blank.
            pass
        except FileNotFoundError:
            # BUGFIX: the original logged the stale `epoch` loop variable
            # (left over from the embedding loop above) instead of the epoch
            # whose file is actually missing, epochs[i].
            flog(f'Could not find file {outdir}/umaps/umap.{epochs[i]}.pkl', LOG)

    # Label outer axes; a single-row grid yields a 1-D axes array.
    if len(axes.shape) == 1:
        axes[0].set_ylabel('UMAP2')
        for a in axes[:]:
            a.set_xlabel('UMAP1')
    else:
        assert len(axes.shape) == 2  # there are more than one row and column of axes
        for a in axes[:, 0]:
            a.set_ylabel('UMAP2')
        for a in axes[-1, :]:
            a.set_xlabel('UMAP1')

    # Shared colorbar driven by the last-drawn hexbin.
    fig.subplots_adjust(right=0.96)
    cbar_ax = fig.add_axes([0.98, 0.15, 0.02, 0.7])
    cbar = fig.colorbar(toplot, cax=cbar_ax)
    cbar.ax.set_ylabel('particle density', rotation=90)

    plt.subplots_adjust(wspace=0.1)
    plt.subplots_adjust(hspace=0.3)
    plt.savefig(f'{outdir}/plots/01_encoder_umaps.png', dpi=300, format='png',
                transparent=True, bbox_inches='tight')
    flog(f'Saved UMAP distribution plot to {outdir}/plots/01_encoder_umaps.png', LOG)
def fit_transform_embed(knn_graph=None):
    # Single-call fit_transform, forwarding an optional precomputed KNN
    # graph; `data` and `n_neighbors` come from the enclosing test scope.
    estimator = cuUMAP(verbose=False, random_state=42,
                       n_neighbors=n_neighbors)
    return estimator.fit_transform(data, knn_graph=knn_graph,
                                   convert_dtype=True)