def umap(self, n_components, metric, data=None):
    """Embed a dataset with UMAP and return the low-dimensional result.

    Args:
        n_components: Number of output dimensions given to ``UMAP``.
        metric: Distance metric given to ``UMAP``.
        data: Optional dataset to embed. When ``None``, ``self.data`` is
            embedded instead.

    Returns:
        Whatever ``UMAP.fit_transform`` returns for the selected dataset.
    """
    reducer = UMAP(n_components=n_components, metric=metric)
    source = self.data if data is None else data
    return reducer.fit_transform(source)
def plot_umap_proc(self, df):
    """Project ``df.Vector`` with UMAP in 2-D and 3-D and plot three groupings.

    The projections are computed once and then handed to ``self.plot_umap``
    three times, coloured by the ``Categ``, ``subject`` and ``chn`` columns.

    Args:
        df: Frame exposing a ``Vector`` column (one vector per row) plus the
            ``Categ``, ``subject`` and ``chn`` columns.
    """
    folder = self.plot_path

    # Build the sample matrix once; both projections embed the same vectors.
    vectors = np.array(df.Vector.tolist())

    # Settings shared by the 2-D and 3-D models.
    shared = dict(spread=1, min_dist=0.5, a=0.7, b=1.2)
    proj_2d = UMAP(n_components=2, **shared).fit_transform(vectors)
    proj_3d = UMAP(n_components=3, **shared).fit_transform(vectors)

    groupings = (
        (df.Categ, "Categ", "category-umap"),
        (df.subject, "subject", "subject-umap"),
        (df.chn, "chn", "channel-umap"),
    )
    for values, label, filename in groupings:
        self.plot_umap(folder, proj_2d, proj_3d, values, label, filename)
class manifold_umap(base_manifold):
    """UMAP manifold that is trained on principal components of the data table."""

    def __init__(self, parent=None, name='none'):
        base_manifold.__init__(self, parent=parent, name=name, manifold_type='UMAP')

    def train(self, num_pc, n_neighbors=None, min_dist=0.3):
        """
        **Purpose**
            Train the UMAP on the first <num_pc> components of a PCA

            UMAP is generally too computationally heavy to do on a full dataset, so you
            should choose the first few PCs to train the UMAP. Check the pca module
            for a PCA interface you can use to select the best PCs

        **Arguments**
            num_pc (Required)
                Either an integer (use the first <num_pc> PCs) or a list of
                1-based PC numbers (use exactly those PCs).

            n_neighbors (Required)
                Estimated number of neighbours

            min_dist (Optional, default=0.3)
                minimum distance between points

        **Returns**
            None. The 2D embedding is stored in self.npos and self.trained is set.

        **Raises**
            AssertionError if the object is not configured, n_neighbors is
            missing, or num_pc is neither an int nor a list.
        """
        # Explicit raises instead of `assert` so validation survives `python -O`
        # while callers still see the same exception type.
        if not self.configured:
            raise AssertionError('umap is not configured, run configure() first')
        if not n_neighbors:
            raise AssertionError('You must specify an estimate for n_neighbors')

        if isinstance(num_pc, int):
            self.__model = PCA(n_components=num_pc, whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            self.__pcas = self.__transform

        elif isinstance(num_pc, list):
            self.__model = PCA(n_components=max(num_pc) + 1, whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            # get only the specific PCs (PC numbers are 1-based, columns 0-based)
            self.__pcas = numpy.array(
                [self.__transform[:, c - 1] for c in num_pc]).T

        else:
            raise AssertionError('num_pcs must be either an integer or a list')

        # min_dist was previously accepted but never forwarded to UMAP.
        self.__model = UMAP(
            n_components=2,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric='correlation',
            random_state=self.random_state,
            verbose=self.verbose)

        self.npos = self.__model.fit_transform(self.__pcas)
        self.trained = True
def calc_umap(X, n_components, n_neighbors, min_dist, spread, random_state):
    """Fit a UMAP model on ``X`` and return the resulting embedding.

    Every argument other than ``X`` is forwarded to the ``UMAP`` constructor
    under the same keyword name.
    """
    settings = dict(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        random_state=random_state,
    )
    return UMAP(**settings).fit_transform(X)
def umap(feats, indices):
    """Embed the selected rows of ``feats`` in 3-D using widget-chosen settings.

    Three streamlit widgets (metric select box, neighbour slider, minimum
    distance slider) supply the UMAP parameters.

    Args:
        feats: Feature matrix, indexed as ``feats[indices, :]``.
        indices: Row selector applied to ``feats``.

    Returns:
        The 3-component UMAP embedding of the selected rows.
    """
    available_metrics = [
        'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'canberra',
        'braycurtis', 'mahalanobis', 'wminkowski', 'seuclidean', 'cosine',
        'correlation'
    ]
    metric = st.selectbox('Metric', available_metrics)
    n_neighbors = st.slider(
        'N Neighbors', min_value=2, max_value=200, value=15, step=1)
    min_dist = st.slider(
        'Minimum Distance', min_value=0.0, max_value=1.0, value=0.1, step=0.01)

    selected = feats[indices, :]
    reducer = UMAP(
        n_components=3,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
    )
    return reducer.fit_transform(selected)
def test_umap_transform_embedding_stability(iris, iris_selection):
    """Check that ``transform`` leaves the fitted embedding untouched.

    Issue #217 reports that embedding new data with an already trained UMAP
    transformer modified the stored training embedding whenever the new data
    had the same number of rows as the training data.
    """
    train = iris.data[iris_selection]
    fitted = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    embedding_before = fitted.embedding_.copy()

    # Deliberately the same shape as the training data: that is the case
    # the issue is about.
    same_shape_data = np.random.random(train.shape)
    _ = fitted.transform(same_shape_data)

    assert_array_equal(
        embedding_before,
        fitted.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Reproduction taken from issue #217.
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    second_model = UMAP()
    first_embedding = second_model.fit_transform(a[:, :5])
    first_embedding_copy = first_embedding.copy()
    assert_array_equal(first_embedding_copy, second_model.embedding_)

    _ = second_model.transform(b)
    assert_array_equal(first_embedding_copy, second_model.embedding_)
def plot_projections(embeds, speakers, ax=None, colors=None, markers=None, legend=True, title=""):
    """Scatter-plot UMAP projections of ``embeds``, one series per speaker.

    Args:
        embeds: Embeddings passed to ``UMAP.fit_transform``.
        speakers: One speaker identifier per embedding.
        ax: Axes to draw on; a new 6x6 figure is created when ``None``.
        colors: Per-speaker colours; a falsy value selects ``_my_colors``.
        markers: Per-speaker markers; ``"o"`` is used when ``None``.
        legend: Whether to label the series and draw a legend.
        title: Axes title.

    Returns:
        The projected coordinates.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    # Project with a default-configured UMAP. Another target dimensionality
    # or another reducer (e.g. PCA, TSNE) could be substituted here.
    projs = UMAP().fit_transform(embeds)

    speakers = np.array(speakers)
    if not colors:
        colors = _my_colors

    # One scatter call per distinct speaker.
    for idx, speaker in enumerate(np.unique(speakers)):
        if markers is None:
            marker = "o"
        else:
            marker = markers[idx]
        points = projs[speakers == speaker]
        ax.scatter(
            *points.T,
            c=[colors[idx]],
            marker=marker,
            label=speaker if legend else None,
        )

    if legend:
        ax.legend(title="Speakers", ncol=2)
    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    return projs
def calc_umap(self, df, n_neighbors=5, min_dist=0.3, metric='correlation', data_type='original_data'):
    """Run UMAP on the feature columns of ``df`` and save the embedding as TSV.

    The ``Gene_Name`` column and the column named by ``self.cfg.Y`` are
    removed before fitting and re-attached to the embedding afterwards.

    Args:
        df: Input frame containing the features plus the two columns above.
        n_neighbors: Forwarded to ``UMAP``.
        min_dist: Forwarded to ``UMAP``.
        metric: Forwarded to ``UMAP``.
        data_type: Label used in the progress message and the output file name.

    Returns:
        Tuple ``(X_umap, total_time)``: the embedding frame (columns ``d0``,
        ``d1``, ... followed by the re-attached columns) and the wall-clock
        seconds spent in ``fit_transform``.
    """
    print(">> Running UMAP from " + data_type + "...")

    tmp_drop_cols = ['Gene_Name', self.cfg.Y]
    X = df.drop(tmp_drop_cols, axis=1)

    umap = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)

    t0 = time()
    embedding = umap.fit_transform(X)
    total_time = time() - t0

    # Reuse df's index: with a fresh RangeIndex, the column-wise concat below
    # aligns on labels and pads with NaN whenever df is not 0..n-1 indexed.
    X_umap = pd.DataFrame(embedding, index=df.index)
    X_umap.columns = ['d' + str(c) for c in X_umap.columns.values]
    X_umap = pd.concat([X_umap, df[tmp_drop_cols]], axis=1)

    filepath = str(self.cfg.unsuperv_out / ("UMAP" + data_type + ".tsv"))
    X_umap.to_csv(filepath, sep='\t', index=None)

    return X_umap, total_time
def embeddingUmap(n_components, n_neighbors, random_state, tfidf_matrix_fit, tfidf_matrix_transform):
    """Fit UMAP on one matrix and embed a second matrix with the fitted model.

    Args:
        n_components: Number of embedding dimensions.
        n_neighbors: Forwarded to ``UMAP``.
        random_state: Forwarded to ``UMAP``.
        tfidf_matrix_fit: Matrix the model is fitted on.
        tfidf_matrix_transform: Matrix projected into the fitted space.

    Returns:
        Tuple ``(umap_df, umap_embedding)``: the embedding as a DataFrame with
        columns ``emb_1`` .. ``emb_<n_components>`` and as the raw array.
    """
    umap = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        random_state=random_state,
    ).fit(tfidf_matrix_fit)

    print("reducing vector's dimensionality...")
    # Use transform, not fit_transform: refitting here would throw away the
    # model learned from tfidf_matrix_fit.
    umap_embedding = umap.transform(tfidf_matrix_transform)

    umap_df = pd.DataFrame(
        umap_embedding,
        columns=[f'emb_{i + 1}' for i in range(n_components)],
    )
    return umap_df, umap_embedding
def dim_red_kmeans(data, cluster, technique):
    """Reduce features to 2-D, cluster them with KMeans and return a scatter plot.

    Args:
        data: Frame with the feature columns and a ``location`` column. A
            ``Cluster`` column is written to it in place.
        cluster: ``'renda'`` selects the columns from ``gdp_per_capita``
            onward; any other value selects from ``cardiovasc_death_rate``
            onward.
        technique: ``'umap'`` or ``'pca'``; any other value uses t-SNE.

    Returns:
        The ``px.scatter`` figure of the 2-D projection coloured by cluster.
    """
    first_feature = 'gdp_per_capita' if cluster == 'renda' else 'cardiovasc_death_rate'
    features = data.loc[:, first_feature:]

    if technique == 'umap':
        reducer = UMAP(n_components=2, init='random', random_state=0)
        proj_2d = reducer.fit_transform(features)
    elif technique == 'pca':
        reducer = PCA(n_components=2, random_state=0)
        proj_2d = reducer.fit(features).transform(features)
    else:
        reducer = TSNE(n_components=2, random_state=0)
        proj_2d = reducer.fit_transform(features)

    clusterer = KMeans(
        n_clusters=7,
        init="k-means++",
        max_iter=500,
        n_init=10,
        random_state=123,
    )
    # Note: this writes the assignments back into the caller's frame.
    data['Cluster'] = clusterer.fit_predict(proj_2d)

    return px.scatter(
        proj_2d,
        x=0,
        y=1,
        color=data.Cluster,
        labels={'color': 'Cluster'},
        hover_name=data.location,
    )
class UMAPAnalyzer(BaseAnalyzer):
    """
    UMAP analysis for features.
    """

    def compute(
        self,
        n_neighbors=100,
        n_components=2,
        min_dist=0.5,
        metric="euclidean",
        verbose=True,
        n_epochs=1000,
        **kwargs,
    ):
        """Fit UMAP on ``self.features`` and store/return the embedding.

        Args:
            n_neighbors: Forwarded to ``UMAP``.
            n_components: Forwarded to ``UMAP``.
            min_dist: Forwarded to ``UMAP``.
            metric: Forwarded to ``UMAP``.
            verbose: Forwarded to ``UMAP``.
            n_epochs: Forwarded to ``UMAP``.
            **kwargs: Any further ``UMAP`` constructor arguments.

        Returns:
            The embedding, also stored as ``self.embedding``. The fitted
            model is stored as ``self.model``.
        """
        self.model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric=metric,
            # Previously hard-coded to True, which ignored the argument.
            verbose=verbose,
            n_epochs=n_epochs,
            **kwargs,
        )
        self.embedding = self.model.fit_transform(self.features)
        return self.embedding
def plot_projections(embeds, speakers, ax=None, colors=None, markers=None, legend=True, title="", **kwargs):
    """Scatter-plot UMAP projections of ``embeds`` per speaker and show the figure.

    Args:
        embeds: Embeddings passed to ``UMAP.fit_transform``.
        speakers: One speaker identifier per embedding.
        ax: Axes to draw on; a new 6x6 figure is created when ``None``.
        colors: Per-speaker colours; a falsy value selects
            ``_embedding_colors_``.
        markers: Per-speaker markers; ``"o"`` is used when ``None``.
        legend: Whether to label the series and draw a legend.
        title: Axes title.
        **kwargs: Forwarded to the ``UMAP`` constructor.

    Returns:
        The projected coordinates.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    reducer = UMAP(**kwargs)
    projs = reducer.fit_transform(embeds)

    speakers = np.array(speakers)
    colors = colors or _embedding_colors_
    for i, speaker in enumerate(np.unique(speakers)):
        speaker_projs = projs[speakers == speaker]
        marker = "o" if markers is None else markers[i]
        label = speaker if legend else None
        ax.scatter(*speaker_projs.T, c=[colors[i]], marker=marker, label=label)

    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    # Attach the legend to the axes that were drawn on (plt.legend targets
    # the *current* axes, which may differ from a caller-supplied ax), and
    # only when the series were actually labelled.
    if legend:
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()

    return projs
def update_figure(selected_dataset):
    """Build a 3-D UMAP scatter figure for the selected 1000-sample dataset.

    Args:
        selected_dataset: ``"MNIST-Digits"`` or ``"MNIST-Fashion"``.

    Returns:
        The plotly figure for a known dataset. For any other value the tuple
        ``(None, "Please select a dataset.")`` is returned.
    """
    # NOTE(review): the two return paths have different arity (figure vs.
    # 2-tuple) - confirm against whatever consumes this function.
    if selected_dataset == "MNIST-Digits":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-labels.csv"
    elif selected_dataset == "MNIST-Fashion":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-labels.csv"
    else:
        return None, "Please select a dataset."

    X = pd.read_csv(input_url)
    y = pd.read_csv(labels_url)
    # Replace the labels by their integer codes (inverse indices of unique).
    y = np.unique(y, return_inverse=True)[1]

    reducer = UMAP(n_components=3, init="random", random_state=0)
    proj_3d = reducer.fit_transform(X, y=y)

    fig = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=y)
    fig.update_layout(transition_duration=500, height=1000)
    fig.update(layout_coloraxis_showscale=False)
    fig.update_traces(marker_size=2)
    return fig
class UMAP:
    """Wrapper around ``UMAP_`` that embeds only the fully finite rows.

    Rows containing NaN or +/-inf (after casting to float32) are skipped and
    come back as all-NaN rows in the output of ``predict``.
    """

    def __init__(self, rfe_cv, *args, **kwargs):
        """Store the RFE flag and build the wrapped ``UMAP_`` model.

        Args:
            rfe_cv: When truthy, ``predict`` refuses to run.
            *args, **kwargs: Forwarded to ``UMAP_``.
        """
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = UMAP_(*args, **kwargs)

    def fit(self, X, y):
        """Do nothing; the model is fitted inside ``predict``."""
        pass

    def predict(self, X):
        """Fit-transform the finite rows of ``X`` and return a row-aligned result.

        Args:
            X: 2-D array indexable as ``X[mask, :]``.

        Returns:
            float64 array with one row per row of ``X``; rows that contained
            NaN/inf are filled with NaN.

        Raises:
            Exception: if ``rfe_cv`` was set at construction.
        """
        # Cast to float32 first so that values overflowing float32 are
        # treated as non-finite, as before.
        Z = numpy.array(X, dtype=numpy.float32)
        nan_mask = numpy.isfinite(Z).all(axis=1)
        X_ = X[nan_mask, :]

        # Compare against the filtered rows; the previous check compared two
        # arrays of identical length and therefore never reported anything.
        dropped = X.shape[0] - X_.shape[0]
        if dropped:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(dropped))

        if self.rfe_cv:
            raise Exception("UMAP could not be processed with RFE_CV")

        predicted = self.model.fit_transform(X_)
        result = numpy.full(
            shape=(X.shape[0], predicted.shape[1]),
            fill_value=numpy.nan,
            dtype=numpy.float64)
        result[nan_mask, :] = predicted
        return result
def project_umap(spk_dict: Dict[str, Tensor], seed):
    """Project speaker embeddings to 2-D with UMAP and save a scatter plot.

    The embeddings of all speakers are concatenated (speakers in sorted
    order), standardised, reduced with a cosine-metric UMAP and drawn as one
    scatter series per speaker into ``umap_plot.svg``.

    Args:
        spk_dict: Maps a speaker name to a tensor whose first dimension
            indexes that speaker's embeddings.
        seed: ``random_state`` for UMAP.

    Raises:
        ModuleNotFoundError: if umap, sklearn or matplotlib is missing.
    """
    try:
        from umap import UMAP
        from sklearn.preprocessing import StandardScaler
        import matplotlib.pyplot as plt
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError(
            'Please install umap, sklearn, and matplotlib from pypi to plot umap results.'
        ) from err

    sorted_speakers = sorted(spk_dict)
    flat_embs = torch.cat([spk_dict[k] for k in sorted_speakers], dim=0).numpy()

    data = StandardScaler().fit_transform(flat_embs)
    reducer = UMAP(metric='cosine', verbose=True, n_neighbors=20, random_state=seed)
    reduced_data = reducer.fit_transform(data)
    print(reduced_data.shape)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))

    # Slice by each speaker's actual row count. Splitting into equally sized
    # chunks attributes points to the wrong speaker as soon as speakers have
    # different numbers of embeddings.
    start = 0
    for speaker in sorted_speakers:
        stop = start + spk_dict[speaker].shape[0]
        points = reduced_data[start:stop]
        ax.scatter(points[:, 0], points[:, 1])
        start = stop

    ax.legend(sorted_speakers)
    ax.set_xlabel('umap 1st component')
    ax.set_ylabel('umap 2nd component')
    ax.set_title("2D umap projection with n_neighbors=20")
    ax.grid(True)
    plt.tight_layout()
    plt.savefig('umap_plot.svg')
    print("Saved umap plot to umap_plot.svg")
def run_umap(dist, logger=None, labels=None, **kwargs):
    """
    Run UMAP on a distance matrix.

    Args:
        dist (array):           A distance matrix, square or condensed form.
                                A 1-D (condensed) input is expanded with
                                ``_squareform`` first.
        logger (Logger):        Logger to use. default is no logging
        labels:                 Optional targets passed to ``fit_transform``
                                as ``y``.
        **kwargs:               Forwarded to the ``UMAP`` constructor.
                                ``n_neighbors`` defaults to 100 and
                                ``n_components`` to 2; ``verbose`` is always
                                forced to True.

    Return:
        emb (np.array):         the UMAP embedding
    """
    # NOTE(review): dist is handed to UMAP as-is; unless the caller passes
    # metric='precomputed' it is presumably treated as a feature matrix -
    # confirm this is intended.
    if len(dist.shape) == 1:
        if logger is not None:
            logger.info('computing squareform')
        dist = _squareform(dist)

    kwargs.setdefault('n_neighbors', 100)
    kwargs.setdefault('n_components', 2)

    if logger is not None:
        logger.info(
            'computing {n_components} components with UMAP'.format(**kwargs))
        # min_dist has no default here, so formatting straight from kwargs
        # raised KeyError whenever the caller did not pass it.
        logger.info(
            'using {n_neighbors} neighbors and {min_dist} min_dist'.format(
                n_neighbors=kwargs['n_neighbors'],
                min_dist=kwargs.get('min_dist', 'default')))

    kwargs['verbose'] = True
    umap = UMAP(**kwargs)
    emb = umap.fit_transform(dist, y=labels)
    return emb
def vanDongenSpectral(args):
    """Score one UMAP + KMeans configuration with the van Dongen criterion.

    Args:
        args: 7-tuple ``(neighbors, min_d, components, metric, dataset,
            scaler, k)``. ``dataset`` and ``scaler`` are keys into the
            module-level ``datasets`` and ``scalers`` mappings.

    Returns:
        ``vanDongen`` applied to the genre-by-cluster contingency table.
    """
    neighbors, min_d, components, metric, dataset, scaler, k = args

    description = (dataset + ', ' + metric + ', ' + scaler +
                   ', n_components=' + str(components) +
                   ', n_neighbors=' + str(neighbors) +
                   ', min_dist=' + str(min_d) +
                   ', k=' + str(k))
    print(description)

    # Standardise the data with the requested scaler.
    scaled = scalers[scaler].fit_transform(datasets[dataset])

    # Embed with UMAP.
    reducer = UMAP(
        n_components=components,
        n_neighbors=neighbors,
        min_dist=min_d,
        metric=metric,
    )
    embedding = reducer.fit_transform(scaled)

    # Cluster the embedding with KMeans.
    clustering = KMeans(n_clusters=k, random_state=0).fit(embedding)

    # Cross-tabulate genres against the assigned clusters.
    assignments = pd.DataFrame({'Generos': metadata.genre, 'data': clustering.labels_})
    contingency = pd.crosstab(assignments['Generos'], assignments['data'])
    return vanDongen(contingency)
def umapper(embed, metric="euclidean", n_neighbors=30, min_dist=1, **kws):
    """Embed ``embed`` with UMAP and draw a labelled scatter with pair connections.

    Points are coloured by the module-level ``labels`` using
    ``CLASS_COLOR_DICT``. The first half of the rows is connected to the
    second half, row by row, via ``add_connections``.

    Args:
        embed: Data passed to ``UMAP.fit_transform``.
        metric: Forwarded to ``UMAP``.
        n_neighbors: Forwarded to ``UMAP``.
        min_dist: Forwarded to ``UMAP``.
        **kws: Accepted but not used.

    Returns:
        Tuple ``(fig, ax)`` of the created matplotlib figure and axes.
    """
    reducer = UMAP(metric=metric, n_neighbors=n_neighbors, min_dist=min_dist)
    coords = reducer.fit_transform(embed)

    plot_df = pd.DataFrame(data=coords)
    plot_df["labels"] = labels

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    sns.scatterplot(
        data=plot_df,
        ax=ax,
        x=0,
        y=1,
        hue="labels",
        palette=CLASS_COLOR_DICT,
        legend=False,
        s=20,
        linewidth=0.5,
        alpha=0.7,
    )
    ax.axis("off")

    # Row i of the first half is paired with row i of the second half.
    left_right_indexing = True
    if left_right_indexing:
        half = len(embed) // 2
        tlp_inds = np.arange(half)
        trp_inds = np.arange(half) + half
        x_col = plot_df.iloc[:, 0]
        y_col = plot_df.iloc[:, 1]
        add_connections(
            x_col.iloc[tlp_inds],
            x_col.iloc[trp_inds],
            y_col.iloc[tlp_inds],
            y_col.iloc[trp_inds],
            ax=ax,
        )
    return fig, ax
def umap(data, labels=None, ax=None, **kwargs):
    '''Draw a UMAP embedding plot of the data.

    :param matrix data: Input data. Numpy array recommended.
    :param list labels: (Optional) Corresponding labels to each datum.
        If specified, data points in the plot will be colored
        according to the label.
    :param axis ax: (Optional) Matplotlib axis to draw the plot on.
        When omitted, the current pyplot axes are used.
    :param kwargs: Any other keyword arguments will be passed onto
        matplotlib.pyplot.scatter.
    :return: The axis that was drawn on.
    '''
    if ax is None:
        # ax is documented as optional, but None used to crash on
        # ax.scatter. Imported locally so the block does not depend on a
        # module-level pyplot import.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    # Apply UMAP and get embeddings.
    reducer = UMAP()
    embeddings = reducer.fit_transform(data)

    if labels is None:
        ax.scatter(x=embeddings[:, 0], y=embeddings[:, 1], **kwargs)
    else:
        # If labels are attached, color them in different colors
        labels = np.array(labels)
        for label in set(labels):
            toDraw = (labels == label)  # only draw these points this time
            ax.scatter(x=embeddings[toDraw, 0],
                       y=embeddings[toDraw, 1],
                       label=label,
                       **kwargs)
        ax.legend(loc='best')

    return ax
def main(dataset):
    """Save PCA, t-SNE and UMAP scatter plots for each marker-selection method.

    For every algorithm the marker indices are loaded from
    ``output/<dataset>_<alg>_markers_full.npz``, the data is restricted to
    those markers, and three 2-D projections are written as PDF and PNG
    under ``figures/dimred/``.
    """
    adata = getdata(dataset)

    def saveplot(coords, dimred):
        # Reads alg, n_markers and adataproj from the enclosing loop at call
        # time, so it must only be called from inside that loop.
        plt.figure()
        point_colors = adataproj.obs["y"].values % 9
        plt.scatter(coords[:, 0], coords[:, 1], s=2, c=point_colors, cmap="Set1")
        plt.tick_params(
            axis="both",
            which="both",
            bottom=False,
            labelbottom=False,
            left=False,
            labelleft=False,
        )
        plt.savefig(
            f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.pdf",
            format="pdf",
        )
        plt.savefig(
            f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.png",
            format="png",
        )
        plt.close()

    algorithms = [
        "cife",
        "bincife",
        "jmi",
        "binmim",
        "logreg",
        "t-test_overestim_var",
        "wilcoxon",
    ]
    for alg in algorithms:
        markers = np.load(f"output/{dataset}_{alg}_markers_full.npz")["markers"]
        # Multi-dimensional marker arrays contribute their first column;
        # flat ones are truncated to the first ten entries.
        markers = markers[:, 0].flatten() if len(markers.shape) > 1 else markers[:10]
        n_markers = len(markers)

        adataproj = adata[:, markers].copy()
        plotprep(adataproj)

        print("Computing PCA coords")
        pca_coords = pr.plot.pca(adataproj.X, 2, return_info=False)
        saveplot(pca_coords, "pca")

        print("Computing tSNE coords")
        tsne_coords = TSNE().fit_transform(adataproj.X.toarray())
        saveplot(tsne_coords, "tsne")

        print("Computing UMAP coords")
        umap_coords = UMAP().fit_transform(adataproj.X)
        saveplot(umap_coords, "umap")
def umapfigure(adata, **scatterkwargs):
    """Plot the UMAP coordinates of ``adata``, computing them when missing.

    If ``adata.obsm`` has no ``X_umap`` entry, one is computed from the
    first 30 columns of ``X_pca``; ``pca`` is run first when ``X_pca`` is
    absent or has fewer than 30 columns.

    Args:
        adata: Annotated data object exposing ``obsm``, ``obsm_keys()``, ``X``.
        **scatterkwargs: Forwarded to ``genericplot``.

    Returns:
        Whatever ``genericplot`` returns.
    """
    if "X_umap" in adata.obsm_keys():
        return genericplot(adata, adata.obsm["X_umap"], **scatterkwargs)

    pca_is_usable = (
        "X_pca" in adata.obsm_keys() and adata.obsm["X_pca"].shape[1] >= 30
    )
    if not pca_is_usable:
        # Centering is skipped for sparse matrices.
        pca(adata, 30, zero_center=not scipy.sparse.issparse(adata.X))

    reducer = UMAP()
    adata.obsm["X_umap"] = reducer.fit_transform(adata.obsm["X_pca"][:, :30])
    return genericplot(adata, adata.obsm["X_umap"], **scatterkwargs)
def get(self, x, labels, clu, eval):
    """Embed ``x`` with label-aware UMAP, cluster it and keep the given labels.

    Args:
        x: Data to embed.
        labels: Array of labels passed to UMAP as ``y``. Entries different
            from ``-1`` overwrite the clusterer's output at the same position.
        clu: Object whose ``get(embedding, eval)`` returns cluster labels.
        eval: Passed through to ``clu.get``.

    Returns:
        The cluster labels with the non ``-1`` input labels restored.
    """
    reducer = UMAP(**self.kwargs)
    self.logger.info("Finding embeddings.")
    embedding = reducer.fit_transform(x, y=labels)

    merged = clu.get(embedding, eval)
    known = np.where(labels != -1)
    merged[known] = labels[known]
    return merged
class UMAP_Preprocessed:
    """Clusterer wrapper that reduces the data with UMAP before clustering.

    ``clusterer`` starts out as ``None`` and has to be assigned before
    ``fit_predict`` can work.
    """

    def __init__(self, *args, **kwargs):
        # The positional and keyword arguments are accepted but not used here.
        self.clusterer = None
        self.preprocessor = UMAP(n_neighbors=30, min_dist=0, n_components=2)

    def fit_predict(self, X):
        """Return ``self.clusterer.fit_predict`` on the 2-D UMAP embedding of ``X``."""
        embedded = self.preprocessor.fit_transform(X)
        return self.clusterer.fit_predict(embedded)
def plot_UMAP(self, features):
    """Project ``features`` to 2-D with UMAP and show a seaborn scatter plot.

    The model uses random initialisation and ``random_state=0``.
    """
    reducer = UMAP(n_components=2, init='random', random_state=0)

    print("Computing projections...")
    projection = reducer.fit_transform(features)

    print("Plotting...")
    sns.scatterplot(data=projection)
    plt.grid(True)
    plt.show()
def umap_reduce(data, **kwargs):
    """Embed ``data`` with ``cumlUMAP``, falling back to ``UMAP`` on failure.

    A ``RuntimeError`` or ``TypeError`` from the ``cumlUMAP`` attempt is
    turned into a warning and the same ``kwargs`` are retried with ``UMAP``.

    Returns:
        Tuple ``(embedding, reducer)`` from whichever implementation succeeded.
    """

    def _embed_with(factory):
        # Build the model from kwargs and fit it on the shared data.
        model = factory(**kwargs)
        return model.fit_transform(data), model

    try:
        return _embed_with(cumlUMAP)
    except (RuntimeError, TypeError) as e:
        warnings.warn(e)
        return _embed_with(UMAP)
def embed_umap(data):
    """Return the UMAP embedding of ``data`` (euclidean metric, 40 neighbours).

    data should be on cpu, numpy. Only ``transform_seed`` is tied to
    ``torch.initial_seed()``; ``random_state`` is left unset.
    """
    reducer = UMAP(
        metric='euclidean',
        n_neighbors=40,
        transform_seed=torch.initial_seed(),
    )
    return reducer.fit_transform(data)
def umapDataReductionTo2D(self):
    """
    Reduce the vectorized corpus to 2-D with UMAP (Uniform Manifold
    Approximation and Projection, ``random_state=1``) and store the result,
    together with the document ids, as a ``FeatureMatrixData``.
    :return: None
    """
    reducer = UMAP(n_components=2, random_state=1)
    embedding = reducer.fit_transform(self.__vectorized_corpus)
    self.__reduced_dim_feature_data = FeatureMatrixData(
        embedding, self.__document_ids)
def reduceWithUMAP(vectors, size):
    """Reduce ``vectors`` to ``size`` features with a euclidean UMAP.

    Logs a progress message through ``log`` before fitting.
    """
    log(f'Reducing data to {size} features using UMAP (slow-ish)')
    reducer = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        metric='euclidean',
        n_components=size,
    )
    return reducer.fit_transform(vectors)
def get_umap_projection(**kwargs):
    '''Get the x,y positions of images passed through a umap projection.

    Reads ``n_neighbors``, ``min_dist``, ``metric`` and ``vecs`` from
    ``kwargs``. ``use_cache`` is only consulted when the output path already
    exists; in that case a truthy value returns the existing path unchanged.
    '''
    print(' * creating UMAP layout')
    out_path = get_path('layouts', 'umap', **kwargs)

    reuse_existing = os.path.exists(out_path) and kwargs['use_cache']
    if reuse_existing:
        return out_path

    reducer = UMAP(
        n_neighbors=kwargs['n_neighbors'],
        min_dist=kwargs['min_dist'],
        metric=kwargs['metric'],
    )
    layout = reducer.fit_transform(kwargs['vecs'])
    return write_layout(out_path, layout, **kwargs)
def on_epoch_begin(self, model):
    """Score the model on every test set and append the results to the log.

    For each ``(df, seed)`` in the module-level ``test_data`` the texts are
    vectorised with ``model.infer_vector``, reduced with UMAP (256
    components) and then Ivis (1 dimension), thresholded with
    ``reject_outliers`` and scored with ``get_scores``. Each score row is
    appended to ``self.result_df`` and the frame is written to
    ``self.log_path`` as TSV. ``self.epoch`` is incremented at the end.

    Args:
        model: Object providing ``infer_vector``.
    """
    print(
        f"\n----------------\n\nEnd of epoch {self.epoch}. Getting scores..."
    )
    scores = defaultdict(list)
    scores["epoch"] = self.epoch

    for df, seed in test_data:
        print("Vectorize...")
        docvecs = df["text"].progress_apply(simple_preprocess)
        docvecs = docvecs.progress_apply(model.infer_vector)

        print("Reduce dimensions...")
        dim_reducer = UMAP(metric="cosine",
                           set_op_mix_ratio=1.0,
                           n_components=256,
                           random_state=42)
        dim_reduced_vecs = dim_reducer.fit_transform(list(docvecs))

        print("Run ivis...")
        dim_reducer = Ivis(embedding_dims=1,
                           k=15,
                           model="maaten",
                           n_epochs_without_progress=10,
                           verbose=0)
        decision_scores = dim_reducer.fit_transform(dim_reduced_vecs)
        decision_scores = decision_scores.astype(float)

        print("Get and save scores...")
        preds = reject_outliers(decision_scores,
                                iq_range=1.0 - contamination)
        # Truthy entries are mapped to -1, all others to 1.
        preds = [-1 if x else 1 for x in preds]

        scores = get_scores(scores, df["outlier_label"], preds)
        scores["seed"] = seed
        score_row = pd.DataFrame(scores, index=[0])
        print(
            f"Scores for epoch {self.epoch} | seed - {seed}:\n{score_row}"
        )

        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to add the row.
        self.result_df = pd.concat([self.result_df, score_row],
                                   ignore_index=True)
        self.result_df.to_csv(self.log_path, sep="\t")

    self.epoch += 1
class TUmap(Transform):
    """
    Transform that replaces a feature pool by its UMAP embedding.

    n_neighbors: This determines the number of neighboring points used in
    local approximations of manifold structure. Larger values will result
    in more global structure being preserved at the loss of detailed local
    structure. In general this parameter should often be in the range 5 to
    50, with a choice of 10 to 15 being a sensible default.

    min_dist: This controls how tightly the embedding is allowed to compress
    points together. Larger values ensure embedded points are more evenly
    distributed, while smaller values allow the algorithm to optimise more
    accurately with regard to local structure. Sensible values are in the
    range 0.001 to 0.5, with 0.1 being a reasonable default.

    metric: This determines the choice of metric used to measure distance
    in the input space. A wide variety of metrics are already coded, and a user
    defined function can be passed as long as it has been JITd by numba.

    n_components, spread, random_state: forwarded unchanged to ``UMAP``.
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            # Previously accepted but never forwarded, so seeding had no effect.
            random_state=random_state,
        )

    def transform(self, fp):
        """Yield one ``Feature`` per UMAP component of the pool ``fp``.

        The model is re-fitted on every call (``fit_transform``).
        """
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUamp: Done")
        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id), x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        """Scatter-plot a two-column embedded pool, optionally coloured.

        Args:
            efp: Embedded feature pool; its array must have exactly 2 columns.
            split_by: Optional feature whose ``data`` colours the points and
                whose ``name`` appears in the title.
        """
        x = efp.array()
        assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])

        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)

        if split_by is not None:
            ax.scatter(x[:, 0], x[:, 1], c=split_by.data, alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
            )
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool"
            )
        fig.show()
from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Load the train/test split.
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: logistic regression on the untransformed inputs.
print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
for features, targets in ((Xtrain, Ytrain), (Xtest, Ytest)):
    print(model.score(features, targets))

# Learn a 10-component UMAP embedding on the training set only, then
# project the test set with the fitted model. Both steps are timed.
umapper = UMAP(n_neighbors=5, n_components=10)

t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)

t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

# Same classifier, now trained and scored on the embedded data.
print("Score with transformation")
model = LogisticRegression()
t0 = datetime.now()
model.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)
for features, targets in ((Ztrain, Ytrain), (Ztest, Ytest)):
    print(model.score(features, targets))