"df_style_texture = pd.read_csv('style_AML211_VAE_Texture{}.csv',names = range(1,101))" .format(i)) array_t = df_style_texture.values df_z_mask = pd.concat([df_root_mask, df_style_mask], 1) df_z_texture = pd.concat([df_root_texture, df_style_texture], 1) ####combine the z dim from both texture and mask f = 0.7 # specify contribution from mask array_mt = np.concatenate( (array_m * f, array_t * (1 - f)), 1 ) #only get the last 60 dimensions for the shape that has the largest variation ####umap reduce dimensions to about 10 reducer2D = umap.UMAP(n_components=2, random_state=50) print('performing combined umap 2D...') umap_result2D = reducer2D.fit_transform(array_mt) #%% print('calculating louvain...') G = kneighbors_graph(array_mt, 200, mode='connectivity', include_self=True) #was 50 G1 = nx.from_scipy_sparse_matrix(G) partition = community.best_partition(G1, resolution=0.9, random_state=50) #%% #first convert all pos t cell values to int for j in ['pos', 't', 'cell']: L = list(df_root_mask[j])
dataset such as MNIST. We first pull the MNIST dataset and then use UMAP to reduce it to only 2 dimensions for easy visualisation. Note that UMAP manages both to group the individual digit classes and to retain the overall global structure among the different digit classes -- keeping 1 far from 0, and grouping triplets of 3,5,8 and 4,7,9, which can blend into one another in some cases. """
import umap
from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(context="paper", style="white")
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(mnist.data)
fig, ax = plt.subplots(figsize=(12, 10))
plt.scatter(embedding[:, 0], embedding[:, 1],
            c=mnist.target.astype(int), cmap="Spectral", s=0.1)
plt.setp(ax, xticks=[], yticks=[])
plt.title("MNIST data embedded into two dimensions by UMAP", fontsize=18)
plt.show()
def generate_umap(activations): umap_ = umap.UMAP(n_neighbors=200, n_components=2, min_dist=0.5) X_2d = umap_.fit_transform(activations) X_2d -= X_2d.min(axis=0) X_2d /= X_2d.max(axis=0) return X_2d
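A usage sketch for `generate_umap`; the activations array here is synthetic, standing in for real network activations:

import numpy as np
activations = np.random.rand(1000, 128)  # stand-in for penultimate-layer activations
X_2d = generate_umap(activations)
print(X_2d.min(axis=0), X_2d.max(axis=0))  # each column is rescaled to [0, 1]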
#images = [IMS_Bcells, IMS_CD3neg_NK, IMS_debris, IMS_eos, IMS_ery, IMS_erydub, IMS_monos, IMS_neutro, IMS_Tcells]
#images = pd.DataFrame(images)
#all_images = pd.concat(images)
frames_b_test = np.nan_to_num(frames_unknown)
frames_b_test2 = np.nan_to_num(frames_unknown2)
frames_b_CD3pos_NK = np.nan_to_num(frames_CD3pos_NK)
all_labels_test = np.zeros(labels_unknown)
all_labels_test2 = np.zeros(labels_unknown2)
all_labels_CD3pos_NK = np.zeros(labels_CD3pos_NK)
# UMAP embedding
import umap
reducer = umap.UMAP(n_neighbors=15)
print("------------UMAP imported------------------")
##############################################################################################
################################## first embedding ###########################################
# embed and time
import time
start = time.time()
embedding = reducer.fit(frames_b, all_labels)
#embedding = reducer.fit(all_frames, all_labels)
#embedding = reducer.fit_transform(blood_image_new_flat)
end = time.time()
print("------------UMAP embedding finished, embedding time = %.1f s------------------" % (end - start))
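A hedged follow-up sketch: since the supervised `fit` above returns a fitted model, the held-out arrays prepared earlier can be projected into the same space with umap-learn's standard `transform` API:

# Project the unknown/held-out frames through the reducer fitted on frames_b.
test_embedding = reducer.transform(frames_b_test)
test_embedding2 = reducer.transform(frames_b_test2)
nk_embedding = reducer.transform(frames_b_CD3pos_NK)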
def visualize(model,              # type: thelper.typedefs.ModelType
              task,               # type: thelper.typedefs.TaskType
              loader,             # type: thelper.typedefs.LoaderType
              draw=False,         # type: bool
              color_map=None,     # type: Optional[Dict[int, np.ndarray]]
              max_samples=None,   # type: Optional[int]
              return_meta=False,  # type: Union[bool, List[AnyStr]]
              **kwargs):
    # type: (...) -> Dict[AnyStr, Any]
    """
    Creates (and optionally displays) a 2D UMAP visualization of sample embeddings.

    By default, all samples from the data loader will be projected using the model and
    used for the visualization. If the task is related to classification, the prediction
    and groundtruth labels will be highlighted using various colors.

    If the model does not possess a ``get_embedding`` attribute, its raw output will be
    used for projections. Otherwise, ``get_embedding`` will be called.

    Args:
        model: the model which will be used to produce embeddings.
        task: the task object used to decode predictions and color samples (if possible).
        loader: the data loader used to get data samples to project.
        draw: boolean flag used to toggle internal display call on or off.
        color_map: map of RGB triplets used to color predictions (for classification only).
        max_samples: maximum number of samples to draw from the data loader.
        return_meta: toggles whether sample metadata should be provided as output or not.

    Returns:
        A dictionary of the visualization result (an RGB image in numpy format), a list
        of projected embedding coordinates, the labels of the samples, and the
        predictions of the samples.
    """
    assert thelper.utils.check_installed("umap"), \
        "could not import optional 3rd-party dependency 'umap-learn'; make sure you install it first!"
    import umap
    assert loader is not None and len(loader) > 0, "no available data to load"
    assert model is not None and isinstance(model, torch.nn.Module), "invalid model"
    assert task is not None and isinstance(task, thelper.tasks.Task), "invalid task"
    assert max_samples is None or max_samples > 0, "invalid maximum loader sample count"
    thelper.viz.logger.debug("fetching data loader samples for UMAP visualization...")
    embeddings, labels, preds, idxs = [], [], [], []
    if isinstance(task, thelper.tasks.Classification) and not task.multi_label:
        assert all([isinstance(n, str) for n in task.class_names]), "unexpected class name types"
        if not color_map:
            if hasattr(task, "color_map"):
                color_map = task.color_map
            else:
                color_map = {idx: thelper.draw.get_label_color_mapping(idx + 1)
                             for idx in task.class_indices.values()}
        color_map = {idx: f"#{c[0]:02X}{c[1]:02X}{c[2]:02X}" for idx, c in color_map.items()}
    if isinstance(return_meta, bool):
        return_meta = task.meta_keys if return_meta else []
    assert isinstance(return_meta, list) and all([isinstance(key, str) for key in return_meta]), \
        "sample metadata keys must be provided as a list of strings"
    meta = {key: [] for key in return_meta}
    for sample_idx, sample in tqdm.tqdm(enumerate(loader), desc="extracting embeddings"):
        if max_samples is not None and sample_idx > max_samples:
            break
        with torch.no_grad():
            input_tensor = sample[task.input_key]
            if task is not None and isinstance(task, thelper.tasks.Classification) and \
                    not task.multi_label and task.gt_key in sample:
                label = sample[task.gt_key]
                if isinstance(label, torch.Tensor):
                    label = label.cpu().numpy()
                if all([isinstance(lbl, str) for lbl in label]):
                    label = [task.class_indices[lbl] for lbl in label]
                pred = model(input_tensor).topk(k=1, dim=1)[1].view(input_tensor.size(0)).cpu().numpy()
                labels.append(label)
                preds.append(pred)
            if hasattr(model, "get_embedding"):
                embedding = model.get_embedding(input_tensor)
            else:
                if not thelper.viz.warned_missing_get_embedding:
                    thelper.viz.logger.warning(
                        "missing 'get_embedding' function in model object; will use output instead")
                    thelper.viz.warned_missing_get_embedding = True
                embedding = model(input_tensor)
            if embedding.dim() > 2:  # reshape to BxC
                embedding = embedding.view(embedding.size(0), -1)
        embeddings.append(embedding.cpu().numpy())
        idxs.append(sample_idx)
        for key in return_meta:
            for v in sample[key]:
                meta[key].append(v)
    embeddings = np.concatenate(embeddings)
    if labels and preds:
        labels, preds = np.concatenate(labels), np.concatenate(preds)
    else:
        labels, preds = [0] * len(embeddings), [0] * len(embeddings)
    seed = thelper.utils.get_key_def("seed", kwargs, 0)
    if seed is None:
        seed = np.random.randint(np.iinfo(np.int32).max)
    prev_state = np.random.get_state()
    np.random.seed(seed)
    default_umap_args = {"n_components": 2}
    umap_args = thelper.utils.get_key_def("umap_args", kwargs, default_umap_args)
    umap_engine = umap.UMAP(**umap_args)
    thelper.viz.logger.debug("computing UMAP projection...")
    embeddings = umap_engine.fit_transform(embeddings)
    np.random.set_state(prev_state)
    fig = plot(embeddings, labels, preds, color_map=color_map, task=task, **kwargs)
    img = thelper.draw.fig2array(fig).copy()
    if draw:
        thelper.viz.logger.debug("displaying UMAP projection...")
        cv.imshow("thelper.viz.umap", img[..., ::-1])  # RGB to BGR for opencv display
        cv.waitKey(1)
    return {
        # key formatting should be compatible with _write_data in thelper/train/base.py
        "tsne-projs/pickle": embeddings,
        "tsne-labels/json": labels.tolist(),
        "tsne-preds/json": preds.tolist(),
        "tsne-idxs/json": idxs,
        "tsne-meta/json": meta,
        "tsne/image": img,
    }
def main():
    # select cpu or gpu
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(16)
    # pre-process data: annotation
    an = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/annotation.csv",
        index_col=0)
    # binarize loh_percent at 0.05 and mutations_per_mb at 28
    tmp = an["loh_percent"].copy()
    for i in range(len(tmp)):
        tmp[i] = 1 if tmp[i] >= 0.05 else 0
    an["loh_percent"] = tmp
    tmp2 = an["mutations_per_mb"].copy()
    for i in range(len(tmp2)):
        tmp2[i] = 1 if tmp2[i] >= 28 else 0
    an["mutations_per_mb"] = tmp2
    # expression data
    x = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/salmonE74cDNA_counts_baseline.csv",
        index_col=0)
    x = x.T
    x = (x + 1).apply(np.log2)
    #test = np.median(x, axis=0)
    x_std = np.std(x, axis=0)
    top_gene = runPams.n_top_gene
    top_gene_idx = x_std.argsort()[::-1][0:top_gene]
    data = x.iloc[:, top_gene_idx]
    data = data.values.copy()
    top_gene_names = list(x.columns[top_gene_idx])
    top_gene_names = np.insert(top_gene_names, 0, "bias")
    #data = np.random.rand(10, 200)
    xn, yn = data.shape

    # umap + kmeans
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_UMAP/"
    # umap
    reducer = umap.UMAP()
    z = reducer.fit_transform(data)
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)
    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)
    # hierarchical clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)
    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)
    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # t-SNE
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_TSNE/"
    z = TSNE(n_components=2).fit_transform(data)
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)
    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)
    # hierarchical clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)
    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)
    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # vae + dnp
    #data = np.random.rand(10, 2000)
    #xn, yn = data.shape
    data = np.reshape(data, (xn, 1, yn))
    data = np.insert(data, 0, 1, axis=2)
    #data = data[:,:,:5000]
    zn, xn, yn = data.shape
    # set s
    set_s = np.zeros(xn * yn)
    set_s[0] = 1
    # set c
    set_c = np.ones(xn * yn)
    set_c[0] = 0
    # numpy -> tensor
    data = torch.tensor(data)
    # dataLoader
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=runPams.batch_size,
                            shuffle=False,
                            num_workers=runPams.n_cpu,
                            pin_memory=torch.cuda.is_available())
    net, optimizer, lossFunc = getVAEPams(xn, yn, device, runPams.lr)
    # numpy -> tensor on the target device
    set_s = torch.tensor(set_s).float().to(device)
    set_c = torch.tensor(set_c).float().to(device)
    # train
    while torch.sum(set_s == 1).item() < (runPams.k + 1):
        print(torch.sum(set_s == 1).item())
        for _ in range(runPams.epoch):
            for step, (x, _) in enumerate(dataLoader):
                b_x = Variable(x.view(-1, xn * yn).float().to(device))
                b_y = Variable(x.view(-1, xn * yn).float().to(device))
                # zero the weights of set c; keep those of set s
                net.fc1.weight.data = net.fc1.weight.data * set_s
                # network
                _, decoded, _ = net(b_x)
                loss = lossFunc(decoded, b_y)  # mean square error
                optimizer.zero_grad()  # clear gradients for this training step
                loss.backward()  # backpropagation, compute gradients
                optimizer.step()  # apply gradients
        print(net.fc1.weight.grad)
        # get new J
        newJ = getNewJ(net.fc1.weight.grad.clone(), set_c, device).item()
        print(newJ)
        # initialize the weight of node J by xavier
        tmpWeight = torch.rand(1, net.fc1.out_features)
        tmpWeight = nn.init.xavier_normal_(tmpWeight)
        net.fc1.weight.data[:, newJ] = tmpWeight
        # update set S and set C
        set_s[newJ] = torch.tensor(1)
        set_c[newJ] = torch.tensor(0)
    # test
    #sys.exit()
    predLabelsByVAE = list()
    features = list()
    for (x, _) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch x (data)
        feature, _, predicted = net(b_x)
        features.append([feature.cpu().detach().numpy()])
        predicted = torch.max(predicted.data, 1)[1].cpu().numpy()
        predLabelsByVAE.append(predicted)
    # test end
    features = np.hstack(features)
    zn, xn, yn = features.shape
    features = np.reshape(features, (xn, yn))
    features = np.array(features)
    z = features
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_VAE+DNP/"
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)
    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)
    # hierarchical clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)
    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)
    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)
    # save gene names
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/genes_selected.csv"
    genes = pd.DataFrame(set_s.cpu().detach().numpy())
    genes = genes.T
    genes.columns = top_gene_names
    genes.to_csv(pathName)
    '''
    kmeans_estimator = KMeans(n_clusters=4, random_state=0).fit(features)
    labelByVAEKmeans = kmeans_estimator.labels_
    # get figures
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    # here the 'o' in 'or' means draw a circle and 'r' means red; the rest follow the same pattern
    for i in range(len(labelByVAEKmeans)):
        plt.plot([features[i, 0]], [features[i, 1]], mark[label_pred[i]], markersize=5)
    # save data
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200702AE_DNP/results/csv_img_res/"
    fileName = pathName + str(runPams.k) + ".png"
    plt.savefig(fileName)
    fileName = pathName + str(runPams.k) + ".csv"
    setS = pd.DataFrame(set_s.cpu().detach().numpy())
    setS = setS.T
    setS.to_csv(fileName)
    #plt.show()
    '''
    return
X_datasets.append(X_coil20)
Y_datasets.append(y_coil20)
X_digits, y_digits = datasets.load_digits(n_class=10, return_X_y=True)
X_datasets.append(X_digits)
Y_datasets.append(y_digits)
dft = pd.read_csv('./data/fashion-mnist_test.csv', dtype=int)  # read test data
X_fashion = dft.drop('label', axis=1)
y_fashion = dft['label']
X_datasets.append(X_fashion)
Y_datasets.append(y_fashion)
# Set up algorithms
methods = OrderedDict()
methods['umap'] = umap.UMAP()
methods['t-SNE'] = manifold.TSNE(n_components=2, init='pca', random_state=0)
fig = plt.figure(figsize=(15, 8))
# Plot results
# note: the second dataset is scikit-learn's 8x8 digits set, not the full MNIST
labels = ['COIL20', 'DIGITS', 'FASHION MNIST']
for i, (label, method) in enumerate(methods.items()):
    for j in range(len(X_datasets)):
        print(X_datasets[j].shape)
        Y = method.fit_transform(X_datasets[j])
        ax = fig.add_subplot(2, 3, i * 3 + j + 1)
        ax.scatter(Y[:, 0], Y[:, 1], c=Y_datasets[j], cmap=plt.cm.Spectral)
        ax.set_title(labels[j])
        ax.set_ylabel(label)
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
def do_umap(filename, bm_plot_name, env_plot_name): # get the reaction-inclusion vector out of the input file and make it into # a bunch of 1/0 columns instead of one column of strings of 1s and 0s data = pd.read_csv(filename) # want each reaction bit in its own column and only pass those columns to umap rxn_incl_cols = data['rxn_incl'].apply(lambda x: pd.Series(list(x))) # first and last columns will be empty and all columns will be strings umap_ready = rxn_incl_cols.iloc[:, 1:-1].astype('int32') # do UMAP print('Doing UMAP') reducer = umap.UMAP() umap_results = reducer.fit_transform(umap_ready) umap_df = pd.DataFrame(data=umap_results, columns=['x', 'y']) # add in other info for plotting purposes plotting_df = pd.concat([umap_df, data], axis=1) # make a colormap for biomass reactions print('Plotting') bm_cdict = {v: k for k, v in enumerate(np.unique(data.biomass))} bm_cvals = [bm_cdict[c] for c in data.biomass] # make a colormap for input metabolites # start by getting the column of input metabolites, splitting it into a list of # lists, then pasting the sublists together so that there's one string to use # for making the colormap in_groups = ['-'.join(sorted(ins.split('-'))) for ins in data.env] in_cdict = {v: k for k, v in enumerate(np.unique(in_groups))} in_cvals = [in_cdict[c] for c in in_groups] # make the figure large plt.figure(figsize=(8, 7)) # make the text legible matplotlib.rcParams.update({ 'font.size': 18, 'xtick.labelsize': 18, 'ytick.labelsize': 18, 'axes.labelsize': 18 }) # do one scatterplot colored by biomass reactions plt.scatter(plotting_df.x, plotting_df.y, c=bm_cvals, cmap='nipy_spectral', s=10) plt.xlabel('UMAP_1') plt.ylabel('UMAP_2') plt.savefig(f'data/{bm_plot_name}.png', dpi=600) # do one scatterplot colored by environments plt.figure(2) plt.figure(figsize=(8, 7)) plt.scatter(plotting_df.x, plotting_df.y, c=in_cvals, cmap='nipy_spectral', s=10) plt.xlabel('UMAP_1') plt.ylabel('UMAP_2') plt.savefig(f'data/{env_plot_name}.png', dpi=600)
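A hedged usage sketch for `do_umap`; the input file and output plot names are hypothetical, but follow the column and path conventions assumed in the function body:

do_umap(
    'data/scenario_results.csv',  # hypothetical CSV with 'rxn_incl', 'biomass', 'env' columns
    'umap_by_biomass',            # written to data/umap_by_biomass.png
    'umap_by_environment'         # written to data/umap_by_environment.png
)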
def layout_umap(
    graph: nx.Graph,
    min_dist: float = 0.75,
    n_neighbors: int = 25,
    max_edges: int = 10000000,
    random_seed: Optional[int] = None,
) -> Tuple[nx.Graph, List[NodePosition]]:
    """
    Automatic graph layout generation by creating a generalized node2vec embedding,
    then using UMAP for dimensionality reduction to 2d space.

    By default, this function automatically attempts to prune each graph to a maximum
    of 10,000,000 edges by removing the lowest weight edges. This pruning is
    approximate and will leave your graph with at most ``max_edges``, but is not
    guaranteed to leave precisely ``max_edges``.

    In addition to pruning edges by weight, this function also only operates over the
    largest connected component in the graph.

    After dimensionality reduction, sizes are generated for each node based upon their
    degree centrality, and these sizes and positions are further refined by an overlap
    removal phase. Lastly, a global partitioning algorithm
    (:func:`graspologic.partition.leiden`) is executed for the largest connected
    component and the partition ID is included with each node position.

    Parameters
    ----------
    graph : :class:`networkx.Graph`
        The graph to generate a layout for. This graph may have edges pruned if the
        count is too high and only the largest connected component will be used to
        automatically generate a layout.
    min_dist : float
        The effective minimum distance between embedded points. Default is ``0.75``.
        Smaller values will result in a more clustered/clumped embedding where nearby
        points on the manifold are drawn closer together, while larger values will
        result in a more even dispersal of points. The value should be set relative
        to the ``spread`` value, which determines the scale at which embedded points
        will be spread out.
    n_neighbors : int
        The size of the local neighborhood (in terms of number of neighboring sample
        points) used for manifold approximation. Default is ``25``. Larger values
        result in more global views of the manifold, while smaller values result in
        more local data being preserved.
    max_edges : int
        The maximum number of edges to use when generating the embedding. Default is
        ``10000000``. The edges with the lowest weights will be pruned until at most
        ``max_edges`` exist. Warning: this pruning is approximate and more edges than
        necessary may be pruned. When running in a 32-bit environment you will most
        likely need to reduce this number or you will run out of memory.
    random_seed : int
        Seed to be used for reproducible results. Default is None and will produce
        random results.

    Returns
    -------
    Tuple[nx.Graph, List[NodePosition]]
        The largest connected component and a list of NodePositions for each node in
        the largest connected component. The NodePosition object contains:
        - node_id
        - x coordinate
        - y coordinate
        - size
        - community

    References
    ----------
    .. [1] McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection
       for Dimension Reduction, ArXiv e-prints 1802.03426, 2018
    .. [2] Böhm, Jan Niklas; Berens, Philipp; Kobak, Dmitry. A Unifying Perspective on
       Neighbor Embeddings along the Attraction-Repulsion Spectrum. ArXiv e-prints
       2007.08902v1, 17 Jul 2020.
    """
    lcc_graph, tensors, labels = _node2vec_for_layout(graph, max_edges, random_seed)
    points = umap.UMAP(
        min_dist=min_dist, n_neighbors=n_neighbors, random_state=random_seed
    ).fit_transform(tensors)
    positions = _node_positions_from(lcc_graph, labels, points, random_seed=random_seed)
    return lcc_graph, positions
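A hedged usage sketch for `layout_umap` on a small random graph; the edge-weight attribute is an assumption (the pruning step suggests weighted edges), and the position fields follow the NodePosition description in the docstring above:

import networkx as nx
g = nx.erdos_renyi_graph(200, 0.05, seed=1)
nx.set_edge_attributes(g, 1.0, "weight")  # assumed: weighted edges for pruning
lcc, positions = layout_umap(g, random_seed=1)
for p in positions[:3]:
    print(p.node_id, p.x, p.y, p.size, p.community)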
import pandas as pd
import numpy as np
import umap
from sklearn.feature_extraction.text import TfidfVectorizer

covidtrials = pd.read_csv(r"C:\clustering\allresults.csv")

def TFIDF(X_train, MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    print("tf-idf with", str(np.array(X_train).shape[1]), "features")
    # get_feature_names was removed in newer scikit-learn versions
    columns = vectorizer_x.get_feature_names_out()
    return (X_train, columns)

Y = covidtrials['newc']
vectorizer = TfidfVectorizer(max_features=75000)
X = vectorizer.fit_transform(Y)
YY = covidtrials['clusters']
mapper = umap.UMAP().fit(X)
import umap.plot
p = umap.plot.points(mapper, labels=YY, color_key_cmap='Paired')
umap.plot.plt.show()
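If I recall correctly, `umap.plot.points` returns a matplotlib Axes, so the figure can be saved as well as shown; the output filename here is illustrative:

p.figure.savefig('covid_trials_umap.png', dpi=300)  # hypothetical output path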
import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from skimage.io import imread
from skimage.transform import rescale, resize, pyramid_gaussian, pyramid_reduce
import umap
#%%
from os.path import join
#%%
iris = load_iris()  # imported above but never loaded in the original
reducer = umap.UMAP(metric="correlation")
embedding = reducer.fit_transform(iris.data)
embedding.shape
#%%
face_attr_df = pd.read_csv(r"E:\Datasets\celeba-dataset\list_attr_celeba.csv")
#%%
celeba_dir = r"E:\Datasets\celeba-dataset\img_align_celeba\img_align_celeba"
imgs = []
for imgi in range(1, 10001):
    img = imread(join(celeba_dir, r"%06d.jpg" % imgi))
    #print(img.shape)
    imgs.append(img)
#%%
# img_rd = rescale(img, 0.11, multichannel=True, anti_aliasing=True)
# print(img_rd.shape)
img_rs = resize(img, (24, 20, 3), anti_aliasing=True)  # resizes the last image read
print(img_rs.shape)
plt.imshow(img_rs)
plt.show()
subXie = Xiecluster[['Soma_region', 'Brain_id', 'SWC_File', 'Celltype', 'Subtype']].copy()
# Rename Dr. Xie's Soma_region to 'Xie Soma_Abbr'
subXie.rename(columns={'Soma_region': 'Xie Soma_Abbr'}, inplace=True)
subXie.index = Xiecluster['SWC_File']
subXie = subXie.reindex(index=os.listdir('/home/penglab/Documents/CLA_swc'), fill_value='0')
# %%
# Use UMAP to map the data from high dimension to low dimension
import umap
import matplotlib.pyplot as plt
import seaborn as sns
reducer = umap.UMAP()
embedding = reducer.fit_transform(Feafile.values)
print('\n')
print('Shape of the UMAP result is ', embedding.shape)
print('The result is an array with ' + str(embedding.shape[0]) + ' samples, but only '
      + str(embedding.shape[1]) + ' feature columns (instead of the '
      + str(Feafile.shape[1]) + ' we started with).')
# Show the original subtype
ShowXie = pd.DataFrame(index=subXie.index, columns=['ux', 'uy', 'Subtype', 'plotc'])
typeR, typeC = np.unique(subXie['Subtype'], return_counts=True)
ShowXie['ux'] = embedding[:, 0]
ShowXie['uy'] = embedding[:, 1]
ShowXie['Subtype'] = subXie['Subtype']
def eval_other_methods(x, y):
    gmm = mixture.GaussianMixture(covariance_type='full',
                                  n_components=args.n_clusters,
                                  random_state=0)
    gmm.fit(x)
    y_pred_prob = gmm.predict_proba(x)
    y_pred = y_pred_prob.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | GMM clustering on raw data")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    y_pred = KMeans(n_clusters=args.n_clusters, random_state=0).fit_predict(x)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | K-Means clustering on raw data")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    sc = SpectralClustering(n_clusters=args.n_clusters,
                            random_state=0,
                            affinity='nearest_neighbors')
    y_pred = sc.fit_predict(x)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | Spectral Clustering on raw data")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    if args.manifold_learner == 'UMAP':
        md = float(args.umap_min_dist)
        hle = umap.UMAP(random_state=0,
                        metric=args.umap_metric,
                        n_components=args.umap_dim,
                        n_neighbors=args.umap_neighbors,
                        min_dist=md).fit_transform(x)
    elif args.manifold_learner == 'LLE':
        from sklearn.manifold import LocallyLinearEmbedding
        hle = LocallyLinearEmbedding(
            n_components=args.umap_dim,
            n_neighbors=args.umap_neighbors).fit_transform(x)
    elif args.manifold_learner == 'tSNE':
        hle = TSNE(n_components=args.umap_dim,
                   n_jobs=16,
                   random_state=0,
                   verbose=0).fit_transform(x)
    elif args.manifold_learner == 'isomap':
        hle = Isomap(n_components=args.umap_dim,
                     n_neighbors=5).fit_transform(x)

    gmm = mixture.GaussianMixture(covariance_type='full',
                                  n_components=args.n_clusters,
                                  random_state=0)
    gmm.fit(hle)
    y_pred_prob = gmm.predict_proba(hle)
    y_pred = y_pred_prob.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | GMM clustering on " + str(args.manifold_learner) + " embedding")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    plt.scatter(*zip(*hle[:, :2]), c=y, label=y)
    plt.savefig(args.save_dir + '/' + args.dataset + '-' + str(args.manifold_learner) + '.png')
    plt.clf()

    y_pred = KMeans(n_clusters=args.n_clusters, random_state=0).fit_predict(hle)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | K-Means clustering on " + str(args.manifold_learner) + " embedding")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")

    sc = SpectralClustering(n_clusters=args.n_clusters,
                            random_state=0,
                            affinity='nearest_neighbors')
    y_pred = sc.fit_predict(hle)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(args.dataset + " | Spectral Clustering on " + str(args.manifold_learner) + " embedding")
    print("======================")
    result = "{}\t{}\t{}".format(ari, nmi, acc)
    print(result)
    print("======================")
def cluster_manifold_in_embedding(hl, y, n_clusters, save_dir, visualize): # find manifold on autoencoded embedding if args.manifold_learner == 'UMAP': md = float(args.umap_min_dist) hle = umap.UMAP(random_state=0, metric=args.umap_metric, n_components=args.umap_dim, n_neighbors=args.umap_neighbors, min_dist=md).fit_transform(hl) elif args.manifold_learner == 'LLE': hle = LocallyLinearEmbedding( n_components=args.umap_dim, n_neighbors=args.umap_neighbors).fit_transform(hl) elif args.manifold_learner == 'tSNE': hle = TSNE(n_components=args.umap_dim, n_jobs=16, random_state=0, verbose=0).fit_transform(hl) elif args.manifold_learner == 'isomap': hle = Isomap( n_components=args.umap_dim, n_neighbors=5, ).fit_transform(hl) # clustering on new manifold of autoencoded embedding if args.cluster == 'GMM': gmm = mixture.GaussianMixture(covariance_type='full', n_components=n_clusters, random_state=0) gmm.fit(hle) y_pred_prob = gmm.predict_proba(hle) y_pred = y_pred_prob.argmax(1) elif args.cluster == 'KM': km = KMeans(init='k-means++', n_clusters=n_clusters, random_state=0, n_init=20) y_pred = km.fit_predict(hle) elif args.cluster == 'SC': sc = SpectralClustering(n_clusters=n_clusters, random_state=0, affinity='nearest_neighbors') y_pred = sc.fit_predict(hle) y_pred = np.asarray(y_pred) y_pred = y_pred.reshape(len(y_pred), ) y = np.asarray(y) y = y.reshape(len(y), ) acc = np.round(cluster_acc(y, y_pred), 5) nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5) ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5) print(args.dataset + " | " + args.manifold_learner + " on autoencoded embedding with " + args.cluster + " - N2D") print("======================") result = "{}\t{}\t{}".format(ari, nmi, acc) print(result) print("======================") if visualize: plt.scatter(*zip(*hle[:, :2]), c=y, label=y) plt.savefig(save_dir + '/' + args.dataset + '-n2d.png') plt.clf() return y_pred, acc, nmi, ari
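A hedged usage sketch for `cluster_manifold_in_embedding`; `latent_codes` and `true_labels` are placeholders for the autoencoder output and ground truth computed elsewhere in this module, and `args` is the module-level argparse namespace the function already relies on:

y_pred, acc, nmi, ari = cluster_manifold_in_embedding(
    hl=latent_codes,    # hypothetical: the autoencoder's latent representation
    y=true_labels,      # hypothetical: ground-truth labels for scoring only
    n_clusters=10,
    save_dir='results',
    visualize=True)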
'''
for i in range(len(n4)):
    elem = n4[i]
    eigs_matrix[k][0], eigs_matrix[k][1] = elem[0], elem[1]
    cols.append(colores['Sparse'])
    k += 1
'''
'''
for i in range(len(n1)):
    elem = n1[i]
    eigs_matrix[k][0], eigs_matrix[k][1] = elem[0], elem[1]
    cols.append(colores['Complete'])
    k += 1
'''
embedding = umap.UMAP(n_neighbors=75,
                      metric='canberra',
                      n_epochs=1000,
                      min_dist=0.01,
                      repulsion_strength=10,
                      negative_sample_rate=50,
                      transform_queue_size=10)
H = embedding.fit_transform(eigs_matrix)
plt.scatter(H[:, 0], H[:, 1], c=cols, s=5)
plt.show()
"""
nx.draw(G_coma[1], with_labels=True, font_weight='bold')
plt.subplot(224)
nx.draw(G_old[1], with_labels=True, font_weight='bold')
plt.show()
np.savetxt("coma_props.csv", coma_props, delimiter=",", fmt="%s")
nx.degree_centrality(Graphs[0])
nx.eigenvector_centrality(Graphs[0])
"""
# mnist dataset digits = datasets.load_digits() x_data = digits.data[0:100] y_d = digits.target[0:100] labels = (2, 3, 7) x_list = [] y_list = [] for i, j in zip(x_data, y_d): if j in labels: x_list.append(i) y_list.append(j) x_data = umap.UMAP(n_neighbors=20, n_components=10, min_dist=0.01, metric='correlation').fit_transform(x_list, y=y_list) parameters = [] sc = StandardScaler() sc.fit(x_data) x_data = sc.transform(x_data) # labels = random.sample(range(10), k=3) x_train, x_test, y_train, y_test = train_test_split(x_data, y_list, test_size=0.1, shuffle=False) dim = len(x_data[0]) theta_list = [] test = QVC(dim, dim, ["0" * dim, "1" * dim], 16384, 1, dim, max(y_d))
"""
(saved parameters from a previous UMAP run; the metric was the custom dist_eigs function)
learning_rate=3.0, local_connectivity=1.0, metric=dist_eigs, metric_kwds=None,
min_dist=0, n_components=2, n_epochs=30, n_neighbors=20, negative_sample_rate=5,
random_state=None, repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
target_metric='categorical', target_metric_kwds=None, target_n_neighbors=-1,
target_weight=0.5, transform_queue_size=4.0, transform_seed=42, verbose=True)
"""
# splitting data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, cols, test_size=0.3)
embedding = umap.UMAP(n_components=2,
                      n_neighbors=25,
                      spread=2,
                      metric=dist_eigs,
                      verbose=True,
                      n_epochs=500)
H = embedding.fit_transform(data, y=cols)
# adding legend
one = mpatches.Patch(facecolor=colores[category1], label=category1,
                     linewidth=0.5, edgecolor='black')
two = mpatches.Patch(facecolor=colores[category2], label=category2,
                     linewidth=0.5, edgecolor='black')
fig = plt.figure()
plt.scatter(H[:, 0], H[:, 1], c=cols, s=5)
def UMAP(self, **kwargs): XT = self.data[self.columns_latent_states].values XT = StandardScaler().fit_transform(XT) XT = umap.UMAP(**kwargs).fit_transform(XT) self.data[[f'UMAP {i+1}' for i in range(XT.shape[1])]] = XT
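A hedged usage sketch, assuming `obj` is an instance of the class defining this method and `self.data` already holds the latent-state columns; all keyword arguments are forwarded to `umap.UMAP`:

obj.UMAP(n_components=2, n_neighbors=30, min_dist=0.1)  # obj is hypothetical
print(obj.data[['UMAP 1', 'UMAP 2']].head())  # new columns are added in place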
repulsion_strength = hf['repulsion_strength'][0][0]
negative_sample_rate = hf['negative_sample_rate'][0][0]
transform_queue_size = hf['transform_queue_size'][0][0]
target_n_neighbors = hf['target_n_neighbors'][0][0]
target_weight = hf['target_weight'][0][0]
transform_seed = hf['transform_seed'][0][0]
metric = sys.argv[1]
hf = h5py.File(os.path.join(script_path, 'D.mat'), 'r')
D = np.array(hf.get('D'))
reducer = umap.UMAP(metric=metric,
                    n_neighbors=n_neighbors,
                    n_components=n_components,
                    learning_rate=learning_rate,
                    min_dist=min_dist,
                    spread=spread,
                    set_op_mix_ratio=set_op_mix_ratio,
                    local_connectivity=local_connectivity,
                    repulsion_strength=repulsion_strength,
                    negative_sample_rate=negative_sample_rate,
                    transform_queue_size=transform_queue_size,
                    target_n_neighbors=target_n_neighbors,
                    target_weight=target_weight,
                    transform_seed=transform_seed)
embedding = reducer.fit_transform(D)
with h5py.File(os.path.join(script_path, 'data.h5'), 'w') as hf:
    hf.create_dataset('R', data=embedding)
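The repeated `hf[key][0][0]` unpacking above could be wrapped in a small helper; a sketch, assuming every parameter is stored as a 1x1 MATLAB array in the v7.3 .mat file:

def read_scalar(hf, key):
    """Read a 1x1 MATLAB scalar from an HDF5 (v7.3 .mat) file opened with h5py."""
    return hf[key][0][0]

# e.g. n_neighbors = int(read_scalar(hf, 'n_neighbors'))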
plt.gcf().clear() #%%[markdown] # ## Clustering grid search # # Search for the best clustering hyperparameters from the # fitted w2v grid search, then plot the results #%% print('Reducing dimensionality of word2vec embeddings for clustering...') # Get the normalized word2vec embeddings w2v_gscv.best_estimator_.named_steps['w2v'].gensim_model.init_sims( replace=True) vectors = w2v_gscv.best_estimator_.named_steps['w2v'].gensim_model.wv.vectors # Reduce dimensionality to 3D using UMAP umapper = umap.UMAP(n_components=3) umap_vectors = umapper.fit_transform(vectors) #%% # Do the clustering print('Performing grid search for clustering...') clust_gscv = GridSearchCV( clust_pipe, CLUST_GRID, scoring={'sil': silhouette_scorer_cosine}, cv=3, refit='sil', error_score=0, return_train_score=False, n_jobs=-2, verbose=1
for sent in test_dataset: concept_scores = {} for i in range(5): concept_scores[i] = 0 words = nltk.word_tokenize(sent) for word in words: for k in concept_words.keys(): for tup in concept_words[k]: if tup[0] == word: concept_scores[k] += tup[1] break print(sent) print(concept_scores) #print(sorted(concept_scores.items(),key = lambda x:x[1], reverse=True)) import umap X_topics = lsa.fit_transform(X) embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics) plt.figure(figsize=(7, 5)) plt.scatter( embedding[:, 0], embedding[:, 1], c=dataset_tar, s=10, # size edgecolor='none') plt.show()
def main(fmat, fxyz, ftags, fcolor, colorscol, prefix, output, peratom, keepraw,
         scale, umap_d, pc1, pc2, projectatomic, plotatomic, adtext):
    """
    Parameters
    ----------
    fmat: Location of descriptor matrix file or name of the tags in ase xyz file. You can use gen_descriptors.py to compute it.
    fxyz: Location of xyz file for reading the properties.
    ftags: Location of tags for the first M samples. Plot the tags on the umap.
    fcolor: Location of a file or name of the tags in ase xyz file. It should contain properties for all samples (N floats) used to color the scatterplot.
    colorscol: The column number of the properties used for the coloring. Starts from 0.
    prefix: Filename prefix, default is ASAP.
    output: The format for output files ([xyz], [matrix]). Default is xyz.
    peratom: Whether to output per-atom UMAP coordinates (True/False).
    keepraw: Whether to keep the high-dimensional descriptor when output is an xyz file (True/False).
    scale: Scale the coordinates (True/False). Scaling is highly recommended.
    umap_d: Dimension of the embedded space.
    pc1: Plot the projection along which principal axis.
    pc2: Plot the projection along which principal axis.
    projectatomic: Build the projection using the (big) atomic descriptor matrix.
    plotatomic: Plot the UMAP coordinates of all atomic environments (True/False).
    adtext: Whether to adjust the texts (True/False).

    Returns
    -------
    """
    foutput = prefix + "-pca-d" + str(umap_d)
    use_atomic_desc = (peratom or plotatomic or projectatomic)
    # try to read the xyz file
    if fxyz != 'none':
        asapxyz = ASAPXYZ(fxyz)
        desc, desc_atomic = asapxyz.get_descriptors(fmat, use_atomic_desc)
        if projectatomic:
            desc = desc_atomic.copy()
    else:
        asapxyz = None
        print("Did not provide the xyz file. We can only output descriptor matrix.")
        output = 'matrix'
    # we can also load the descriptor matrix from a standalone file
    if os.path.isfile(fmat[0]):
        try:
            desc = np.genfromtxt(fmat[0], dtype=float)
            print("loaded the descriptor matrix from file: ", fmat)
        except:
            raise ValueError('Cannot load the descriptor matrix from file')
    # sanity check
    if len(desc) == 0:
        raise ValueError('Please supply descriptors in an xyz file or a standalone descriptor matrix')
    print("shape of the descriptor matrix: ", np.shape(desc),
          "number of descriptors: ", np.shape(desc[0]))
    if ftags != 'none':
        tags = np.loadtxt(ftags, dtype="str")[:]
        ndict = len(tags)
    else:
        tags = []
    # scale & center
    if scale:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        print('Shape of descriptor matrix is {}'.format(desc.shape))
        print(scaler.fit(desc))
        desc = scaler.transform(desc)  # normalizing the features
    # fit UMAP
    reducer = umap.UMAP()
    proj = reducer.fit_transform(desc)
    # parentheses added: `and` binds tighter than `or`, so the original condition
    # `peratom or plotatomic and not projectatomic` skipped the projectatomic guard
    if (peratom or plotatomic) and not projectatomic:
        proj_atomic_all = reducer.transform(desc_atomic)
    # save
    if output == 'matrix':
        np.savetxt(foutput + ".coord", proj, fmt='%4.8f',
                   header='low D coordinates of samples')
        if peratom:
            np.savetxt(foutput + "-atomic.coord", proj_atomic_all, fmt='%4.8f',
                       header='low D coordinates of samples')
    if output == 'xyz':
        if os.path.isfile(foutput + ".xyz"):
            os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
        asapxyz.set_descriptors(proj, 'pca_coord')
        if peratom:
            asapxyz.set_atomic_descriptors(proj_atomic_all, 'pca_coord')
        # remove the raw descriptors
        if not keepraw:
            asapxyz.remove_descriptors(fmat)
            asapxyz.remove_atomic_descriptors(fmat)
        asapxyz.write(foutput)
    # color scheme
    plotcolor, plotcolor_peratom, colorlabel, colorscale = set_color_function(
        fcolor, asapxyz, colorscol, 0, (peratom or plotatomic), projectatomic)
    # make plot
    if plotatomic:
        outfile = 'UMAP_4_' + prefix + '-c-' + fcolor + '-plotatomic.png'
    else:
        outfile = 'UMAP_4_' + prefix + '-c-' + fcolor + '.png'
    fig_spec_dict = {
        'outfile': outfile,
        'show': False,
        'title': None,
        'xlabel': 'Principal Axis 1',
        'ylabel': 'Principal Axis 2',
        'xaxis': True,
        'yaxis': True,
        'remove_tick': False,
        'rasterized': True,
        'fontsize': 16,
        'components': {
            "first_p": {"type": 'scatter', 'clabel': colorlabel},
            "second_p": {"type": 'annotate', 'adtext': adtext},
        },
    }
    asap_plot = Plotters(fig_spec_dict)
    asap_plot.plot(proj[::-1, [pc1, pc2]], plotcolor[::-1], [], tags)
    if (peratom or plotatomic) and not projectatomic:
        asap_plot.plot(proj_atomic_all[::-1, [pc1, pc2]],
                       plotcolor_peratom[::-1], [], [])
    plt.show()
normalized_pao1_core_numeric_df = pd.DataFrame(
    normalized_pao1_core_numeric,
    columns=pao1_core_numeric.columns,
    index=pao1_core_numeric.index,
)
normalized_pa14_core_numeric = scaler.fit_transform(pa14_core_numeric)
normalized_pa14_core_numeric_df = pd.DataFrame(
    normalized_pa14_core_numeric,
    columns=pa14_core_numeric.columns,
    index=pa14_core_numeric.index,
)
# +
# model_pao1 = pca.fit(normalized_pao1_expression_numeric_df)
model_pao1 = umap.UMAP(random_state=123).fit(normalized_pao1_core_numeric_df)
normalized_pao1_core_encoded = model_pao1.transform(normalized_pao1_core_numeric_df)
normalized_pao1_core_encoded_df = pd.DataFrame(
    data=normalized_pao1_core_encoded,
    index=normalized_pao1_core_numeric_df.index,
    columns=["1", "2"],
)
# Add back label
normalized_pao1_core_encoded_df[["our label", "sra label"]] = pao1_core_label[[
    "our label", "sra label"
]]
from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Dimension reduction and clustering libraries
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

sns.set(style='white', rc={'figure.figsize': (10, 8)})
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
standard_embedding = umap.UMAP(random_state=42).fit_transform(mnist.data)
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1],
            c=mnist.target.astype(int), s=0.1, cmap='Spectral')
kmeans_labels = cluster.KMeans(n_clusters=10).fit_predict(mnist.data)
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1],
            c=kmeans_labels, s=0.1, cmap='Spectral')
(adjusted_rand_score(mnist.target, kmeans_labels),
 adjusted_mutual_info_score(mnist.target, kmeans_labels))
    ax.imshow(images[i_img], cmap='gray', interpolation='none')
    if annosize is not None:  # if set: annotate the image index and ground-truth label
        ax.annotate("%d" % i_img, xy=(0, 0.98), xycoords='axes fraction',
                    ha='left', va='top', color='y', fontsize=annosize)
        ax.annotate("L:%d" % labels[i_img], xy=(1, 0.98), xycoords='axes fraction',
                    ha='right', va='top', color='c', fontsize=annosize)
    ax.axis('off')
plt.show()

draw_digits(list(range(24)))

import umap
from scipy.sparse.csgraph import connected_components
res_umap = umap.UMAP().fit_transform(digits.data)
print(res_umap.shape)
import matplotlib.cm as cm
plt.figure(figsize=(6, 6))
plt.scatter(res_umap[:, 0], res_umap[:, 1], s=3, c=digits.target, cmap=cm.tab10)
plt.colorbar()
plt.show()
plt.scatter(res_umap[:, 0], res_umap[:, 1], s=10, c=digits.target, cmap=cm.tab10)
plt.axis([-3, 1, 3, 5])
plt.grid()
plt.show()
i_list = np.where((-1.5 < res_umap[:, 0]) & (res_umap[:, 0] < -1) &
                  (3.3 < res_umap[:, 1]) & (res_umap[:, 1] < 4))[0]
i_list
draw_digits(i_list)
correlated_genes = list(set(correlated_genes))
full_list += correlated_genes
import collections
full_freq = collections.Counter(full_list)
full_list = []
for k, v in full_freq.items():
    if v >= 3:
        full_list.append(k)
full_list.sort()
##-----------------------------------------------------------------------------
## for clustering
embedding = umap.UMAP(n_neighbors=5,
                      min_dist=0.0,
                      n_components=2,
                      metric='cosine').fit_transform(new_y_pred)
kmeans = KMeans(n_clusters=6, random_state=1).fit(embedding)
y_label = kmeans.labels_.copy()
## for visualization: reuse the same embedding -- recomputing UMAP without a fixed
## random_state would produce a different layout than the one the labels were fit on
embedding = pd.DataFrame(embedding)
embedding.columns = ['UMAP1', 'UMAP2']
embedding["Proton"] = y_label
f = sns.lmplot(x='UMAP1', y='UMAP2',
index.verbose = True faiss_index_file = 'faiss.index' if os.path.exists(faiss_index_file): print('load existing index from %s' % faiss_index_file) index = faiss.read_index(faiss_index_file, faiss.IO_FLAG_MMAP) index.hnsw.efSearch = 256 else: # build lossy faiss index print('build new index and save to %s' % faiss_index_file) index.hnsw.efConstruction = 40 data = np.ascontiguousarray(mnist.data, dtype=np.float32) # we no longer need mnist data in its original form print('train index...') index.train(data) print('add vectors to index...') index.add(data) print('save...') faiss.write_index(index, faiss_index_file) reducer = umap.UMAP(random_state=42, init="random", verbose=True, n_epochs=200) embedding = reducer.fit_faiss_transform(index) #embedding = reducer.fit_transform(mnist.data) fig, ax = plt.subplots(figsize=(12, 10)) color = mnist.target.astype(int) plt.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap="Spectral", s=0.1) plt.setp(ax, xticks=[], yticks=[]) plt.title("MNIST data embedded into two dimensions by UMAP", fontsize=18) plt.show()
def evaluation(y_pred,
               cluster_method="Kmeans",
               num_cluster=25,
               n_neighbors=20,
               min_dist=0.0):
    '''
    Supports three clustering methods: K-means, spectral clustering, and GMM.
    '''
    # the UMAP projection is identical for all three methods, so compute it once
    embedding = umap.UMAP(n_neighbors=n_neighbors,
                          min_dist=min_dist,
                          n_components=num_cluster,
                          metric="euclidean").fit_transform(y_pred)
    if cluster_method == "Kmeans":
        kmeans = KMeans(n_clusters=num_cluster, random_state=1).fit(embedding)
        centroid = kmeans.cluster_centers_.copy()
        y_label = kmeans.labels_.copy()
    elif cluster_method == "SC":
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        assign_labels="discretize",
                                        random_state=0).fit(embedding)
        y_label = clustering.labels_.copy()
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values
    else:
        gmm = GaussianMixture(n_components=num_cluster).fit(embedding)
        y_label = gmm.predict(embedding)
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values
    y_pseudo = np.zeros((y_pred.shape[0], num_cluster))
    ## alternative approach: soft assignment via a Student's t kernel (alpha=1)
    #for j in range(centroid.shape[0]):
    #    y_pseudo[:, j] = (np.linalg.norm(embedding - centroid[j, :], axis=1) + 1) ** (-1)
    #    ## cosine distance
    #    #y_pseudo[:, j] = ((1 - cosine_similarity(embedding, centroid[j, :].reshape(1, embedding.shape[1])) + 1) ** (-1))[:, 0]
    #y_pseudo = pd.DataFrame(y_pseudo)
    #y_pseudo2 = np.zeros((y_pred.shape[0], centroid.shape[0]))
    #for j in range(centroid.shape[0]):
    #    y_pseudo2[:, j] = y_pseudo.iloc[:, j].values / np.sum(
    #        y_pseudo[y_pseudo.columns.difference([j])].values, axis=1)
    #y_pseudo = y_pseudo2
    ## soft assignment used in this study: distance-based
    for j in range(centroid.shape[0]):
        ## euclidean distance
        y_pseudo[:, j] = 1 / np.linalg.norm(embedding - centroid[j, :], axis=1)
        ## cosine similarity
        #y_pseudo[:, j] = 1 / (1 - cosine_similarity(embedding, centroid[j, :].reshape(1, embedding.shape[1])))[:, 0]
    y_pseudo = softmax(y_pseudo, axis=1)
    ## auxiliary target distribution
    f = np.sum(np.square(y_pseudo) / np.sum(y_pseudo, axis=0), axis=1)
    y2 = np.square(y_pseudo) / np.sum(y_pseudo, axis=0)
    au_tar = (y2.T / f).T
    return au_tar, y_label, embedding
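A hedged usage sketch with synthetic data; in the real pipeline `y_pred` would be the network's latent representation:

import numpy as np
dummy = np.random.rand(500, 32)  # synthetic stand-in for the model output
au_tar, y_label, embedding = evaluation(dummy, cluster_method="GMM", num_cluster=5)
print(au_tar.shape, y_label.shape, embedding.shape)  # (500, 5), (500,), (500, 5)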
import pandas as pd
import umap
import matplotlib.pyplot as plt
# load the data
datafile = u'data.csv'
data = pd.read_csv(datafile)
data_fea = data.iloc[:, 1:]  # take the columns that hold the indicator variables
data_fea = data_fea.fillna(0)  # fill missing values
# standardize
data_mean = data_fea.mean()
data_std = data_fea.std()
data_fea = (data_fea - data_mean) / data_std
# dimensionality reduction
umap_data = umap.UMAP(n_neighbors=5,
                      min_dist=0.3,
                      n_components=3).fit_transform(data_fea.values)
# min-max normalization
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
umap_data = min_max_scaler.fit_transform(umap_data)
# plot the pairwise component views
plt.figure(figsize=(12, 5))
plt.scatter(umap_data[:, 0], umap_data[:, 1])
plt.scatter(umap_data[:, 1], umap_data[:, 2])
plt.scatter(umap_data[:, 2], umap_data[:, 0])
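Since three UMAP components are requested, a hedged alternative is to draw each pairwise view on its own axes rather than overlaying all three on one figure:

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (a, b) in zip(axes, [(0, 1), (1, 2), (2, 0)]):
    ax.scatter(umap_data[:, a], umap_data[:, b], s=5)
    ax.set_xlabel('UMAP%d' % (a + 1))
    ax.set_ylabel('UMAP%d' % (b + 1))
plt.show()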
def plot_embedding_of_points_2PLOTS(embedding,
                                    labels1,
                                    labels2,
                                    path_save1,
                                    path_save2,
                                    name_save1,
                                    name_save2,
                                    n_samples_plot=None,
                                    method='TSNE'):
    n_samples = embedding.shape[0]
    if n_samples_plot is not None:
        indices_to_plot = np.random.choice(range(n_samples),
                                           min(n_samples_plot, n_samples),
                                           replace=False)
    else:
        indices_to_plot = np.random.choice(range(n_samples), n_samples, replace=False)
    embedding_sampled = embedding[indices_to_plot, :]
    # keep the sampled rows even when the embedding is already 2D, so the plotted
    # points stay aligned with the sampled labels below (the original reassigned
    # the full embedding here, mismatching the sampled labels)
    if embedding.shape[1] != 2:
        if method == 'TSNE':
            embedding_sampled = TSNE(n_components=2).fit_transform(embedding_sampled)
        elif method == 'UMAP':
            embedding_sampled = umap.UMAP(n_neighbors=500).fit_transform(embedding_sampled)
    labels1 = np.asarray(labels1).astype(int)
    labels2 = np.asarray(labels2).astype(int)
    labels1_sampled = labels1[indices_to_plot]
    labels2_sampled = labels2[indices_to_plot]
    #### plot1 (classwise):
    _, ax = plt.subplots(1, figsize=(14, 10))
    n_classes = FLAGS.num_classes
    class_names = [class_list[str(i)] for i in range(len(class_list))]
    plt.scatter(embedding_sampled[:, 0], embedding_sampled[:, 1],
                s=10, c=labels1_sampled, cmap='Spectral', alpha=1.0)
    cbar = plt.colorbar(boundaries=np.arange(FLAGS.num_classes + 1) - 0.5)
    cbar.set_ticks(np.arange(FLAGS.num_classes))
    cbar.set_ticklabels(class_names)
    if not os.path.exists(path_save1):
        os.makedirs(path_save1)
    plt.savefig(path_save1 + name_save1 + '.png')
    plt.clf()
    plt.close()
    #### plot2 (domainwise):
    _, ax = plt.subplots(1, figsize=(14, 10))
    n_classes = len(domain_list)
    class_names = domain_list
    plt.scatter(embedding_sampled[:, 0], embedding_sampled[:, 1],
                s=10, c=labels2_sampled, cmap='Spectral', alpha=1.0)
    cbar = plt.colorbar(boundaries=np.arange(n_classes + 1) - 0.5)
    cbar.set_ticks(np.arange(n_classes))
    cbar.set_ticklabels(class_names)
    if not os.path.exists(path_save2):
        os.makedirs(path_save2)
    plt.savefig(path_save2 + name_save2 + '.png')
    plt.clf()
    plt.close()
    np.save('embedding_sampled.npy', embedding_sampled)
    np.save('labels1_sampled.npy', labels1_sampled)
    np.save('labels2_sampled.npy', labels2_sampled)
    np.save('embedding.npy', embedding)
    np.save('labels1.npy', labels1)
    np.save('labels2.npy', labels2)
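A hedged usage sketch with synthetic inputs; it assumes the module-level `FLAGS`, `class_list`, and `domain_list` this function already references, and the paths and names are illustrative:

import numpy as np
emb = np.random.rand(300, 64)                       # synthetic high-dimensional features
cls = np.random.randint(0, FLAGS.num_classes, 300)  # synthetic class labels
dom = np.random.randint(0, len(domain_list), 300)   # synthetic domain labels
plot_embedding_of_points_2PLOTS(emb, cls, dom,
                                './plots/classwise/', './plots/domainwise/',
                                'tsne_classwise', 'tsne_domainwise',
                                n_samples_plot=200, method='TSNE')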