def test_umap_data_formats(input_type, should_downcast, nrows, n_feats, name):
    dtype = np.float32 if not should_downcast else np.float64
    n_samples = nrows

    if name == 'digits':
        # use the digits dataset for unit test
        digits = datasets.load_digits(n_class=9)
        X = digits["data"].astype(dtype)
    else:
        X, y = datasets.make_blobs(n_samples=n_samples,
                                   n_features=n_feats,
                                   random_state=0)

    umap = cuUMAP(n_neighbors=3, n_components=2, verbose=False)

    if input_type == 'dataframe':
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        embeds = umap.fit_transform(X_cudf, convert_dtype=True)
        assert type(embeds) == cudf.DataFrame
    else:
        embeds = umap.fit_transform(X)
        assert type(embeds) == np.ndarray
def test_umap_data_formats(input_type, should_downcast, nrows, n_feats, name):
    dtype = np.float32 if not should_downcast else np.float64
    n_samples = nrows

    if name == 'digits':
        # use the digits dataset for unit test
        digits = datasets.load_digits(n_class=9)
        X = digits["data"].astype(dtype)
    else:
        X, y = datasets.make_blobs(n_samples=n_samples,
                                   n_features=n_feats,
                                   random_state=0)

    umap = cuUMAP(n_neighbors=3, n_components=2, verbose=False)
    embeds = umap.fit_transform(X)
    assert type(embeds) == np.ndarray
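# The two tests above assume imports along these lines (a sketch; the exact
# aliases, in particular `cuUMAP`, are assumptions rather than copied from the
# original test module). In the original suite the arguments (input_type,
# should_downcast, nrows, n_feats, name) would typically be supplied via
# `pytest.mark.parametrize`.
import numpy as np
import pandas as pd
import cudf
from sklearn import datasets
from cuml.manifold import UMAP as cuUMAP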
if enable_gpu:
    kmeans_float = cuml.KMeans(n_clusters=n_clusters)
else:
    kmeans_float = sklearn.cluster.KMeans(n_clusters=n_clusters)
kmeans_float.fit(df_fingerprints)
print('Runtime Kmeans time (hh:mm:ss.ms) {}'.format(datetime.now() - task_start_time))

# UMAP
task_start_time = datetime.now()
if enable_gpu:
    umap = cuml.UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
else:
    umap = umap.UMAP()
Xt = umap.fit_transform(df_fingerprints)
print('Runtime UMAP time (hh:mm:ss.ms) {}'.format(datetime.now() - task_start_time))

if enable_gpu:
    df_fingerprints.add_column('x', Xt[0].to_array())
    df_fingerprints.add_column('y', Xt[1].to_array())
    df_fingerprints.add_column('cluster', kmeans_float.labels_)
else:
    df_fingerprints['x'] = Xt[:, 0]
    df_fingerprints['y'] = Xt[:, 1]
    df_fingerprints['cluster'] = kmeans_float.labels_

# start dash
v = chemvisualize.ChemVisualization(df_fingerprints.copy(), n_clusters,
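# A minimal, self-contained sketch (assumptions, not the original pipeline) of the
# GPU/CPU switch used above: the same KMeans + UMAP steps on a small random
# "fingerprint" matrix, with cuml/cudf on the GPU and scikit-learn/umap-learn
# otherwise. The data shape and n_clusters are illustrative values.
from datetime import datetime
import numpy as np
import pandas as pd

enable_gpu = False
n_clusters = 7
fp = pd.DataFrame(np.random.rand(1000, 512).astype(np.float32))

if enable_gpu:
    import cudf
    import cuml
    df_fp = cudf.DataFrame.from_pandas(fp)
    km = cuml.KMeans(n_clusters=n_clusters)
    reducer = cuml.UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
else:
    import sklearn.cluster
    import umap as umap_lib
    df_fp = fp
    km = sklearn.cluster.KMeans(n_clusters=n_clusters)
    reducer = umap_lib.UMAP()

t0 = datetime.now()
labels = km.fit_predict(df_fp)          # cluster in the original feature space
embedding = reducer.fit_transform(df_fp)  # 2D embedding for plotting
print('KMeans + UMAP took {}'.format(datetime.now() - t0))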
def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)

    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    #     csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    #     print('loading training files...')
    #     X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    #     y_train=X_train['class_']
    #     X_train.drop(['class_'], axis=1)
    #     X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    #     y_test=X_test['class_']
    #     X_test.drop(['class_'], axis=1)
    #     y_train=le.inverse_transform(y_train)
    #     y_test=le.inverse_transform(y_test)
    # except:
    #     print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    # ##################################
    # # CLUSTERING!!!
    # ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
        "lle"
        Locally Linear Embedding (LLE) uses many local linear decompositions to
        preserve globally non-linear structures.
        "ltsa"
        LTSA LLE: local tangent space alignment is similar to LLE in that it
        uses locality to preserve neighborhood distances.
        "hessian"
        Hessian LLE an LLE regularization method that applies a hessian-based
        quadratic form at each neighborhood
        "modified"
        Modified LLE applies a regularization parameter to LLE.
        "isomap"
        Isomap seeks a lower dimensional embedding that maintains geometric
        distances between each instance.
        "mds"
        MDS: multi-dimensional scaling uses similarity to plot points that are
        near to each other close in the embedding.
        "spectral"
        Spectral Embedding a discrete approximation of the low dimensional
        manifold using a graph representation.
        "tsne" (default)
        t-SNE: converts the similarity of points into probabilities then uses
        those probabilities to create an embedding.
    '''

    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine',
                          classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig = umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees)', size=16)
    plt.title('Feature importances with %s features' % (str(len(features[0]))))
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e
    # --> https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb
    #     (plotly for correlation heatmap and scatterplot matrix)
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features), tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################
    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features),
                                        tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features),
                         tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(), np.array(features), tclass_labels,
                                train_color="maroon", test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features), tclass_labels,
                                draw_threshold=True,
                                linefmt="C0-", markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='silhouette.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot score vs. percentile of features selected (Anova + LogisticRegression)
    # to see which percentile of features is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])

    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent of features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()
    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans, X_train,
                                   cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas,
        lr_probas,
        nb_probas,
        # svm_scores,
        dt_scores,
        ab_scores,
        gb_scores,
        knn_scores
    ]
    clf_names = [
        'Random Forest',
        'Logistic Regression',
        'Gaussian NB',
        # 'SVM',
        'Decision Tree',
        'Adaboost',
        'Gradient Boost',
        'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1],
        lr_probas[:, 1],
        nb_probas[:, 1],
        # svm_scores[:, 1],
        dt_scores[:, 1],
        ab_scores[:, 1],
        gb_scores[:, 1],
        knn_scores[:, 1]
    ]
    plot_roc_curve(y_test, probs, clf_names)

    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
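# `plot_roc_curve` above is a helper defined elsewhere in the original project.
# A minimal sketch of what it could look like (an assumption, not the original
# implementation): one ROC curve per classifier from positive-class scores,
# which only makes sense for binary labels. Assumes matplotlib's `plt` and
# scikit-learn are already imported at module level.
def plot_roc_curve(y_true, probs, clf_names, outpath='roc_curves.png'):
    from sklearn.metrics import roc_curve, auc
    plt.figure()
    for scores, name in zip(probs, clf_names):
        fpr, tpr, _ = roc_curve(y_true, scores)
        plt.plot(fpr, tpr, label='%s (AUC = %0.2f)' % (name, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance line
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.savefig(outpath)
    plt.close()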
def umap(
    adata: AnnData,
    min_dist: float = 0.5,
    spread: float = 1.0,
    n_components: int = 2,
    maxiter: Optional[int] = None,
    alpha: float = 1.0,
    gamma: float = 1.0,
    negative_sample_rate: int = 5,
    init_pos: Union[_InitPos, np.ndarray, None] = 'spectral',
    random_state: AnyRandom = 0,
    a: Optional[float] = None,
    b: Optional[float] = None,
    copy: bool = False,
    method: Literal['umap', 'rapids'] = 'umap',
    neighbors_key: Optional[str] = None,
) -> Optional[AnnData]:
    """\
    Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold learning
    technique suitable for visualizing high-dimensional data. Besides tending
    to be faster than tSNE, it optimizes the embedding such that it best
    reflects the topology of the data, which we represent throughout Scanpy
    using a neighborhood graph. tSNE, by contrast, optimizes the distribution
    of nearest-neighbor distances in the embedding such that these best match
    the distribution of distances in the high-dimensional space. We use the
    implementation of `umap-learn <https://github.com/lmcinnes/umap>`__
    [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint
    <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    min_dist
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points
        on the manifold are drawn closer together, while larger values will
        result in a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out. The default in the `umap-learn`
        package is 0.1.
    spread
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components
        The number of dimensions of the embedding.
    maxiter
        The number of iterations (epochs) of the optimization. Called
        `n_epochs` in the original UMAP.
    alpha
        The initial learning rate for the embedding optimization.
    gamma
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state
        If `int`, `random_state` is the seed used by the random number
        generator; If `RandomState` or `Generator`, `random_state` is the
        random number generator; If `None`, the random number generator is the
        `RandomState` instance used by `np.random`.
    a
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    b
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    copy
        Return a copy instead of writing to adata.
    method
        Use the original 'umap' implementation, or 'rapids' (experimental,
        GPU only).
    neighbors_key
        If not specified, umap looks in .uns['neighbors'] for neighbors
        settings and .obsp['connectivities'] for connectivities (default
        storage places for pp.neighbors). If specified, umap looks in
        .uns[neighbors_key] for neighbors settings and
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata

    if neighbors_key is None:
        neighbors_key = 'neighbors'
    if neighbors_key not in adata.uns:
        raise ValueError(
            f'Did not find .uns["{neighbors_key}"]. Run `sc.pp.neighbors` first.'
        )

    start = logg.info('computing UMAP')

    neighbors = NeighborsView(adata, neighbors_key)

    if 'params' not in neighbors or neighbors['params']['method'] != 'umap':
        logg.warning(
            f'.obsp["{neighbors["connectivities_key"]}"] have not been computed using umap'
        )

    # Compat for umap 0.4 -> 0.5
    with warnings.catch_warnings():
        # umap 0.5.0
        warnings.filterwarnings("ignore", message=r"Tensorflow not installed")
        import umap

    if version.parse(umap.__version__) >= version.parse("0.5.0"):

        def simplicial_set_embedding(*args, **kwargs):
            from umap.umap_ import simplicial_set_embedding

            X_umap, _ = simplicial_set_embedding(
                *args,
                densmap=False,
                densmap_kwds={},
                output_dens=False,
                **kwargs,
            )
            return X_umap

    else:
        from umap.umap_ import simplicial_set_embedding

    from umap.umap_ import find_ab_params

    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    else:
        a = a
        b = b
    adata.uns['umap'] = {'params': {'a': a, 'b': b}}

    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(
            adata, random_state=random_state, neighbors_key=neighbors_key
        )
    else:
        init_coords = init_pos  # Let umap handle it

    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32,
                                  accept_sparse=False)

    if random_state != 0:
        adata.uns['umap']['params']['random_state'] = random_state
    random_state = check_random_state(random_state)

    neigh_params = neighbors['params']
    X = _choose_representation(
        adata,
        neigh_params.get('use_rep', None),
        neigh_params.get('n_pcs', None),
        silent=True,
    )
    if method == 'umap':
        # the data matrix X is really only used for determining the number of
        # connected components for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            neighbors['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP

        n_neighbors = neighbors['params']['n_neighbors']
        n_epochs = (
            500 if maxiter is None else maxiter
        )  # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
            random_state=random_state,
        )
        X_umap = umap.fit_transform(X_contiguous)

    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates

    logg.info(
        '    finished',
        time=start,
        deep=('added\n' "    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
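# A short usage sketch (assumption, not part of the original module): the
# function above requires `sc.pp.neighbors` to have populated the neighbor
# graph in `adata.uns` / `adata.obsp` before it is called.
if __name__ == '__main__':
    import scanpy as sc
    adata = sc.datasets.pbmc68k_reduced()    # small demo dataset shipped with scanpy
    sc.pp.neighbors(adata, n_neighbors=15)   # builds .uns['neighbors'] / .obsp
    sc.tl.umap(adata, min_dist=0.5)          # the function above, via scanpy's public API
    # sc.tl.umap(adata, method='rapids')     # GPU path, requires cuml
    print(adata.obsm['X_umap'].shape)        # (n_cells, 2)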
# this can no longer be visualized in 2D because it has 3 attributes,
# so we are going to generate a 2D projection of this data
X = dataset.iloc[:, [2, 3, 4]].values
X = StandardScaler().fit_transform(X)

# we are going to use a projection called UMAP; it is recent
# and has shown excellent results
umap = umap.UMAP(n_neighbors=3, min_dist=0.6, metric='cosine')

# this projection is only used for visualization;
# our clustering will be done in the n-dimensional space,
# which in this case is n=3
X_projected = umap.fit_transform(X)

plt.scatter(X_projected[:, 0], X_projected[:, 1], s=100, color="blue")
plt.grid()
plt.show()

kmeans = KMeans(n_clusters=7, init="random", max_iter=300)
# pred_y will hold the list of resulting clusters;
# it is NOT a dependent variable, it is a new attribute:
# the predicted cluster/group labels
labels = kmeans.fit_predict(X)

plt.scatter(X_projected[:, 0], X_projected[:, 1], s=100, c=labels)
plt.grid()  # one option to show the grid in our plot
# this is the position of each centroid in the plot,
# which is not necessarily associated with an actual data point
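# A minimal sketch (an assumption, not the original code) of how the centroid
# idea in the comments above could be completed: project the KMeans centroids
# with the already-fitted UMAP model and overlay them on the scatter plot.
centroids_projected = umap.transform(kmeans.cluster_centers_)
plt.scatter(centroids_projected[:, 0], centroids_projected[:, 1],
            s=300, c='red', marker='*')  # centroids, not actual data points
plt.show()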