def _svd_scale_tsne(X, n_components):
    """Reduce X with TruncatedSVD, standardize the result and embed it with bh_sne."""
    svd = TruncatedSVD(n_components=n_components, random_state=2016)
    X_svd = svd.fit_transform(X)
    X_scaled = StandardScaler().fit_transform(X_svd)
    return bh_sne(X_scaled)


def extract_tsne_gather_feat(stage):
    """
    Extract tsne gather features.

    Note: python2 only.
    Better than func:extract_tsne_feat in cv, but worst in submission.

    The work is split into checkpointed stages; every stage whose number is
    >= ``stage`` is executed and earlier stages are reloaded from ``tmp2/``:

    1. fit the shared TF-IDF vectorizer on title/query/description and pickle it
    2. title embedding        -> ``tmp2/tsne_t``
    3. title+query embedding  -> ``tmp2/tsne_qt``
    4. description embedding  -> ``tmp2/df_tsne_gather_feats.csv``

    :param stage: first stage to (re)compute, 1..4
    """
    # NOTE(review): block boundaries were reconstructed from a whitespace-less
    # source; confirm which statements belong to each ``if stage <= N``.
    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
    n_rows = len(df_w2vlem_join)

    # Created unconditionally: stage 2 writes into it, so it must exist even
    # when stage 1 is skipped (previously a NameError for stage == 2).
    df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)

    if stage <= 1:
        tfidf = TfidfVectorizer(ngram_range=(2, 4), stop_words='english', min_df=2)
        df_w2vlem_join['t_w2v'].to_csv('tmp2/t_w2v', index=False)
        df_w2vlem_join['q_w2v'].to_csv('tmp2/q_w2v', index=False)
        df_w2vlem_join['d_w2v'].to_csv('tmp2/d_w2v', index=False)
        # Fit from the dumped files, then switch back to in-memory input
        # for the transform() calls below.
        tfidf.set_params(input='filename')
        tfidf.fit(['tmp2/t_w2v', 'tmp2/q_w2v', 'tmp2/d_w2v'])
        tfidf.set_params(input='content')
        with open('tmp2/tfidf_obj', 'wb') as fh:
            cPickle.dump(tfidf, fh)

    with open('tmp2/tfidf_obj', 'rb') as fh:
        tfidf = cPickle.load(fh)
    X_t = tfidf.transform(df_w2vlem_join['t_w2v'].tolist())

    if stage <= 2:
        X_tsne = _svd_scale_tsne(X_t, 100)
        df_feat['tsne_t_1'] = X_tsne[:n_rows, 0]
        df_feat['tsne_t_2'] = X_tsne[:n_rows, 1]
        df_feat.to_csv('tmp2/tsne_t', index=False)

    df_feat = pd.read_csv('tmp2/tsne_t')
    if stage <= 3:
        print(df_feat)
        X_q = tfidf.transform(df_w2vlem_join['q_w2v'].tolist())
        X_tq = sp.hstack([X_t, X_q]).tocsr()
        X_tsne = _svd_scale_tsne(X_tq, 50)
        df_feat['tsne_qt_1'] = X_tsne[:n_rows, 0]
        df_feat['tsne_qt_2'] = X_tsne[:n_rows, 1]
        df_feat.to_csv('tmp2/tsne_qt', index=False)

    df_feat = pd.read_csv('tmp2/tsne_qt')
    if stage <= 4:
        print(df_feat)
        X_d = tfidf.transform(df_w2vlem_join['d_w2v'].tolist())
        X_tsne = _svd_scale_tsne(X_d, 100)
        df_feat['tsne_desc_1'] = X_tsne[:n_rows, 0]
        df_feat['tsne_desc_2'] = X_tsne[:n_rows, 1]
        df_tsne_feats = df_feat
        df_tsne_feats.to_csv('tmp2/df_tsne_gather_feats.csv')
def test_seed():
    """Check that two bh_sne runs seeded identically give identical embeddings."""
    import numpy as np
    from sklearn.datasets import load_iris
    from tsne import bh_sne

    X = load_iris().data

    # copy_data=True so the second run starts from untouched input.
    t1 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True)
    t2 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True)
    assert np.all(t1 == t2)
def meta_pca_sne(exID, experiment_folder): # put exID back plot_subfolder = experiment_folder + "/meta_pca" plot_data_directory = check_create_directory(plot_subfolder) filename = "{}/META".format(plot_data_directory) # mongo stuff dbClient = DatabaseClient() filteredResults = dbClient.query(exID) if filteredResults is None: print "No results" return filteredId = filteredResults[0]['_id'] experiment = dbClient.get(filteredId) list_of_coords = experiment['DATA']['TSNE_DATA'] np_list = np.asarray(list_of_coords) print "META shape: ", np_list.shape epochs = experiment['DATA']['EPOCH'] layers = experiment['DATA']['LAYER'] labels = [] no_samples = len(epochs) for i in range(no_samples): labels.append(epochs[i] + (layers[i] * 0.1)) # labels.append(epochs[i]) labels = np.asarray(labels) labels = labels[:500] np_list = np_list[:, :500] # print "LIST", np_list # print "list size:", np_list.shape perp = 10.0 no_data_shape = np_list.shape[0] if (((perp / 3.0) - 1.0) < no_data_shape): perp = (no_data_shape / 3.0) - 1.0 sne_co = bh_sne(np_list, perplexity=perp, theta=0.5) print "sne", sne_co.shape print "labels", labels plt.scatter(sne_co[:, 0], sne_co[:, 1], c=labels) plt.savefig(filename, dpi=120) plt.close() # plt.show() print "show" flat_coords = np.reshape(sne_co, (1, -1)) flat_coords = flat_coords.tolist()[0] experiment['DATA']['META'] = flat_coords updatedObject = dbClient.update(filteredId, experiment)
def getTsne(modelFile, outDir, NBOW2=True): pp = numpy.load(modelFile) wv = pp['Wemb'].copy() sklearn_pca = PCA(n_components=50) Y_sklearn = sklearn_pca.fit_transform(wv) Y_sklearn = numpy.asfarray( Y_sklearn, dtype='float' ) print "PCA transformation done ..." print "Waitig for t-SNE computation ..." reduced_vecs = bh_sne(Y_sklearn) with open(outDir + "/tsne", "w") as out: for i in range(len(reduced_vecs)): out.write(str(reduced_vecs[i,0]) + " " + str(reduced_vecs[i,1]) + "\n") out.close print "t-SNE written to file ..." if NBOW2: av = pp['AVs'].astype('float64').T[0] wts =[] for i in range(len(wv)): wt = sigmoid(numpy.dot(wv[i],av)) wts.append(wt) with open(outDir + "/wts", "w") as out: for i in range(len(wts)): out.write(str(wts[i]) + "\n") out.close
def fit_transform(self, X):
    """Perform both a fit and a transform on the input data

    Fit the data to the reduction algorithm, and transform
    the data to the reduced space.

    Parameters
    ----------
    X : pandas.DataFrame
        A (n_samples, n_features) dataframe to both fit and transform

    Returns
    -------
    X_reduced : pandas.DataFrame
        The bh_sne embedding of ``X``, carrying the same index as ``X``

    Raises
    ------
    ValueError
        If the input is not a pandas DataFrame, will not perform
        the fit and transform (presumably raised by ``_check_dataframe``;
        confirm against that helper)
    """
    from tsne import bh_sne
    self._check_dataframe(X)
    # Hand bh_sne a plain C-ordered float64 array rather than the DataFrame
    # itself (other callers in this file note that it needs float64).
    values = X.values.astype('float64', order='C')
    return pd.DataFrame(bh_sne(values), index=X.index)
def visualize(x_data, y_data, y_name):
    """Embed samples with bh_sne and scatter-plot them, one color per class 0..9.

    :param x_data: samples; flattened to (n_samples, -1) before embedding
    :param y_data: integer class id per sample
    :param y_name: legend label per class id, indexed 0..9
    """
    # float64 is needed for bh_sne
    features = np.asarray(x_data).astype('float64')
    class_ids = np.asarray(y_data).astype('int')
    class_names = np.asarray(y_name)
    features = features.reshape((features.shape[0], -1))

    # perform t-SNE embedding
    embedding = bh_sne(features)
    xs, ys = embedding[:, 0], embedding[:, 1]

    # plot the result, one scatter call per class so each gets a legend entry
    fig, ax = plt.subplots()
    almost_black = '#262626'
    palette = plt.cm.Set3(np.linspace(0, 1, 10))
    for class_i, color in enumerate(palette):
        members = np.where(class_ids == class_i)[0]
        plt.scatter(xs[members], ys[members],
                    label=class_names[class_i],
                    alpha=0.9,
                    edgecolor=almost_black,
                    linewidth=0.15,
                    facecolor=color)

    ax.legend(loc=1)
    ax.grid(True)
    plt.clim(-0.5, 9.5)
    plt.show()
def getTsne(modelFile, outDir, NBOW2=True): pp = numpy.load(modelFile) wv = pp['Wemb'].copy() sklearn_pca = PCA(n_components=50) Y_sklearn = sklearn_pca.fit_transform(wv) Y_sklearn = numpy.asfarray(Y_sklearn, dtype='float') print "PCA transformation done ..." print "Waitig for t-SNE computation ..." reduced_vecs = bh_sne(Y_sklearn) with open(outDir + "/tsne", "w") as out: for i in range(len(reduced_vecs)): out.write( str(reduced_vecs[i, 0]) + " " + str(reduced_vecs[i, 1]) + "\n") out.close print "t-SNE written to file ..." if NBOW2: av = pp['AVs'].astype('float64').T[0] wts = [] for i in range(len(wv)): wt = sigmoid(numpy.dot(wv[i], av)) wts.append(wt) with open(outDir + "/wts", "w") as out: for i in range(len(wts)): out.write(str(wts[i]) + "\n") out.close
def api_function(self, *params):
    """Regenerate features for a dataset, embed them with bh_sne and return the plot.

    :param params: optional; when ``params[0]`` is truthy it replaces ``self.t``
        (the dataset sub-folder name) before anything else happens
    :return: dict with key ``data`` holding the PIL image of the saved plot
    """
    # Tolerate a call without positional arguments instead of raising
    # IndexError on params[0].
    # NOTE(review): the source had lost its indentation; only the assignment
    # to self.t is assumed to be conditional. Confirm against the original.
    if params and params[0]:
        self.t = params[0]

    self.t_root = os.path.join(self.data_root, self.t, "imgs")
    self.t_label = os.path.join(self.data_root, self.t, "label.txt")
    t_set = DigitImage(self.t_root, self.t_label,
                       transform=transforms.Compose([
                           transforms.Resize((28, 28)),
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    self.train_loader = torch.utils.data.DataLoader(
        t_set, batch_size=self.batch_size, shuffle=True, **self.kwargs)
    self.generate_feature()

    # These files are read right after generate_feature(); presumably it
    # writes them. bh_sne is given float64 input.
    output = np.load('train/output.npy').astype(np.float64)
    data = np.load('train/data.npy')
    target = np.load('train/target.npy')
    print('data shape: ', data.shape)
    print('target shape: ', target.shape)
    print('output shape: ', output.shape)

    output_2d = bh_sne(output)
    np.save('train/output_2d.npy', output_2d, allow_pickle=False)

    image_path = os.path.join(self.result_root, self.t, "tsne.png")
    plt.rcParams['figure.figsize'] = 20, 20
    plt.scatter(output_2d[:, 0], output_2d[:, 1], c=target * 10)
    plt.savefig(image_path, bbox_inches='tight')
    # Close the figure so repeated calls neither draw on top of the previous
    # scatter nor keep figures alive.
    plt.close()

    return dict(
        data=Image.open(image_path)
    )
def meta_pca_sne(exID, experiment_folder): # put exID back plot_subfolder = experiment_folder + "/meta_pca" plot_data_directory = check_create_directory(plot_subfolder) filename = "{}/META".format(plot_data_directory) # mongo stuff dbClient = DatabaseClient() filteredResults = dbClient.query(exID) if filteredResults is None: print "No results" return filteredId = filteredResults[0]['_id'] experiment = dbClient.get(filteredId) list_of_coords = experiment['DATA']['TSNE_DATA'] np_list = np.asarray(list_of_coords) print "META shape: ", np_list.shape epochs = experiment['DATA']['EPOCH'] layers = experiment['DATA']['LAYER'] labels = [] no_samples = len(epochs) for i in range(no_samples): labels.append(epochs[i] + (layers[i]*0.1)) # labels.append(epochs[i]) labels = np.asarray(labels) labels = labels[:500] np_list = np_list[:,:500] # print "LIST", np_list # print "list size:", np_list.shape perp = 10.0 no_data_shape = np_list.shape[0] if (((perp / 3.0) - 1.0) < no_data_shape): perp = (no_data_shape / 3.0) - 1.0 sne_co = bh_sne(np_list, perplexity=perp, theta=0.5) print "sne", sne_co.shape print "labels", labels plt.scatter(sne_co[:,0], sne_co[:,1], c=labels) plt.savefig(filename, dpi=120) plt.close() # plt.show() print "show" flat_coords = np.reshape(sne_co, (1,-1)) flat_coords = flat_coords.tolist()[0] experiment['DATA']['META'] = flat_coords updatedObject = dbClient.update(filteredId, experiment)
def t_sne(obj): p = parser() data_categories = {} label_categories = {} for d in obj: for c in p.categories_item(d): if c not in data_categories: data_categories[c] = [] label_categories[c] = [] data_categories[c].append(d[1:]) label_categories[c].append('g' if d[0] == 1 else 'r') print len(data_categories) for c in data_categories: print '------------------------' print '%s (%d)' % (c, len(data_categories[c])) print '------------------------' if len(data_categories[c]) > 100: t_sne(data_categories[c], label_categories[c]) else: print 'small dimensionality' arr = np.array(data_categories, dtype=np.float64) x2 = bh_sne(arr) plt.scatter(x2[:, 0], x2[:, 1], c=label_categories) plt.show()
def t_sne_vis(name, base_model, x_processed_images, random_state, labels):
    """
    Plot a t-SNE embedding of the features a model extracts from images.

    :param name: the name of the cnn model used to build features
    :param base_model: the model obj
    :param x_processed_images: the input images for our model
    :param random_state: for fixing the results
    :param labels: 0/1 classification labels
    :return: the t-sne coordinates (one row per image); the plot itself is
        shown and saved as ``tsne_vis_<name>.png``
    """
    # convert data to images
    print('Converting data points to composite image')
    X_train = x_processed_images
    # Report height x width; the second dimension used to be printed twice.
    height = X_train.shape[1]
    width = X_train.shape[2] if len(X_train.shape) > 2 else X_train.shape[1]
    print('we got %d different images of shape %dx%d ' % (len(X_train), height, width))

    print('build usefull features from the selected model')
    features = base_model.predict(X_train)
    x_data1 = np.asarray(features).astype('float64')
    x_data1 = x_data1.reshape((x_data1.shape[0], -1))

    # perform t-SNE embedding
    print('performing t-sne reduction')
    vis_data = bh_sne(x_data1, random_state=random_state)

    # plot the result
    fig = plt.figure(figsize=(15, 15))
    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]
    plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("winter", 2))
    plt.colorbar(ticks=range(2))
    plt.clim(0, 1)
    plt.title(name)
    plt.grid()
    # Save before show(): once the interactive window is closed the figure
    # may no longer be renderable.
    fig.savefig('tsne_vis_' + name + '.png')
    plt.show()
    return vis_data
def project_vectors(X_in, model='tsne', perp=10, n_components=2):
    """Project vectors to ``n_components`` dimensions with t-SNE or PCA.

    :param X_in: array of samples; for t-SNE it is flattened to
        (n_samples, -1) and cast to float64
    :param model: ``'tsne'`` (bh_sne) or ``'pca'`` (whitened PCA)
    :param perp: t-SNE perplexity; ``None`` keeps bh_sne's own default
    :param n_components: output dimensionality for either model
    :return: the projected array
    :raises NotImplementedError: for any other ``model`` value
    """
    if model == 'tsne':
        from tsne import bh_sne
        X_in = X_in.reshape((X_in.shape[0], -1)).astype('float64')
        # n_components used to be ignored on this branch; forward it as the
        # embedding dimension ``d``.
        tsne_kwargs = {'d': n_components}
        if perp is not None:
            tsne_kwargs['perplexity'] = perp
        return bh_sne(X_in, **tsne_kwargs)

    if model == 'pca':
        pca = PCA(n_components=n_components, whiten=True)
        pca.fit(X_in)
        return pca.transform(X_in)

    raise NotImplementedError("unsupported projection model: %r" % (model,))
def do_tsne(): cell_accum, cell_status = files_to_cells(files) feats_accum = [] for i, (x, status) in enumerate(zip(cell_accum, cell_status)): feats_accum.append(features(x)) feats_accum = np.asarray(feats_accum) cell_status = np.asarray(cell_status) from tsne import bh_sne points = bh_sne(feats_accum.astype("float64")) for c, p in zip(category10, plts): mask = np.zeros_like(points[:, 0]) print p for pi in p: print pi, dd[pi] mask[cell_status == dd[pi]] = 1 print np.sum(mask) mask = (mask == 1) plt.plot(points[:, 0][mask], points[:, 1][mask], "o", c=c, lw=0, alpha=0.5) #plt.legend([" ".join(p) for p in plts]) plt.legend(["Healthy", "Septic", "Non-Septic"]) plt.show()
def tsne_fit_transform(data, perplexity=50.0, nsvd=30):
    """Optionally reduce ``data`` with TruncatedSVD, standardize it and run bh_sne.

    :param data: input matrix
    :param perplexity: perplexity forwarded to bh_sne
    :param nsvd: number of SVD components; the SVD step is skipped when <= 0
    :return: the bh_sne embedding
    """
    # NOTE(review): indentation was lost in the source; the scaler is assumed
    # to run whether or not the SVD step does.
    reduced = data
    if nsvd > 0:
        reduced = TruncatedSVD(n_components=nsvd).fit_transform(reduced)
    scaled = StandardScaler().fit_transform(reduced)
    return bh_sne(scaled, perplexity=perplexity)
def test_seed():
    """Check output shape and seed reproducibility of bh_sne on the iris data."""
    import numpy as np
    from sklearn.datasets import load_iris
    from tsne import bh_sne

    samples = load_iris().data

    # Two runs, each with its own freshly seeded RandomState.
    first, second = [
        bh_sne(samples, random_state=np.random.RandomState(0), copy_data=True)
        for _ in range(2)
    ]

    n_rows, n_cols = first.shape[0], first.shape[1]
    assert n_rows == 150
    assert n_cols == 2
    assert np.all(first == second)
def do_bhsne():
    """Run bh_sne on ``X`` with the settings found in the enclosing scope.

    Reads ``X``, ``embed_dimensions``, ``perplexity``, ``random_state`` and
    ``method_kwargs`` from outside this function and returns bh_sne's result.
    """
    embedding = bh_sne(
        data=X,
        d=embed_dimensions,
        perplexity=perplexity,
        random_state=random_state,
        **method_kwargs
    )
    return embedding
def tSNE(x_data): vis_data = bh_sne(x_data) # tsne embedding vis_x = vis_data[:, 0] vis_y = vis_data[:, 1] plt.scatter(vis_x, vis_y, c='black') plt.show() '''
def visualize(vecs): print "Got the vectors, now doing dimesnion reduction..." reduced = bh_sne(vecs) print "Reduction done, now plotting: " for i in range(len(reduced)): plt.plot(vecs[i,0], vecs[i,1], marker='o', markersize=8) plt.show()
def image_scatter(features, images, img_res, res=4000, cval=1.):
    """
    Embeds images via tsne into a scatter plot.

    Parameters
    ---------
    features: numpy array
        Features to visualize

    images: list or numpy array
        Corresponding images to features. Expects float images from (0,1).

    img_res: float or int
        Resolution to embed images at

    res: float or int
        Size of embedding image in pixels

    cval: float or numpy array
        Background color value

    Returns
    ------
    canvas: numpy array
        Image of visualization
    """
    features = np.copy(features).astype('float64')
    images = [gray_to_color(image) for image in images]
    images = [min_resize(image, img_res) for image in images]
    max_width = max(image.shape[0] for image in images)
    max_height = max(image.shape[1] for image in images)

    f2d = bh_sne(features)

    xx = f2d[:, 0]
    yy = f2d[:, 1]
    x_min, x_max = xx.min(), xx.max()
    y_min, y_max = yy.min(), yy.max()

    # Fix the ratios
    sx = (x_max - x_min)
    sy = (y_max - y_min)
    if sx > sy:
        res_x = sx / float(sy) * res
        res_y = res
    else:
        res_x = res
        res_y = sy / float(sx) * res

    # Array shapes and linspace sample counts must be integers; the ratio
    # computation above yields floats.
    res_x = int(res_x)
    res_y = int(res_y)

    # Pad by the largest image so a thumbnail placed at the far edge still fits.
    canvas = np.ones((res_x + max_width, res_y + max_height, 3)) * cval
    x_coords = np.linspace(x_min, x_max, res_x)
    y_coords = np.linspace(y_min, y_max, res_y)
    for x, y, image in zip(xx, yy, images):
        w, h = image.shape[:2]
        # Snap each embedded point to the nearest canvas grid position.
        x_idx = np.argmin((x - x_coords)**2)
        y_idx = np.argmin((y - y_coords)**2)
        canvas[x_idx:x_idx + w, y_idx:y_idx + h] = image
    return canvas
def reduce_features_dim(word2vec_model):
    """
    - Reduces features dimensionality using t-sne

    Takes the model's ``syn0`` matrix, casts it to float64 and returns its
    bh_sne embedding.
    """
    vectors_f64 = word2vec_model.syn0.astype('float64')
    embedding = bh_sne(vectors_f64)
    return embedding
def perform_tsne_transformation(X):
    """Flatten ``X`` to (n_samples, -1) float64 and embed it with bh_sne (perplexity 5).

    bh_sne is used instead of scikit-learn's TSNE because of a bug the
    original author hit in scikit-learn.
    """
    samples = np.asarray(X).astype('float64')
    n_samples = samples.shape[0]
    flattened = samples.reshape((n_samples, -1))
    return bh_sne(flattened, perplexity=5)
def tsne(embedding, word_2_id, sample_size = 1000):
    """Embed word vectors with bh_sne and annotate a random sample of words.

    :param embedding: array of word vectors, one row per word id
    :param word_2_id: mapping from word to its row index in ``embedding``
    :param sample_size: maximum number of words to annotate; capped at the
        vocabulary size
    """
    embedding_2d = bh_sne(embedding.astype(np.float64))

    # random.sample needs a real sequence (a dict view is rejected on
    # Python 3) and raises if asked for more items than exist.
    vocabulary = list(word_2_id.keys())
    keys = random.sample(vocabulary, min(sample_size, len(vocabulary)))

    fig, ax = plt.subplots()
    rows = [word_2_id[k] for k in keys]
    for k, row in zip(keys, rows):
        ax.annotate(k, (embedding_2d[row, 0], embedding_2d[row, 1]))

    if rows:
        # Annotations do not take part in autoscaling, so fit the view to the
        # annotated points explicitly.
        sampled = embedding_2d[rows]
        ax.set_xlim(sampled[:, 0].min(), sampled[:, 0].max())
        ax.set_ylim(sampled[:, 1].min(), sampled[:, 1].max())
    plt.show()
def perform_tsne_transformation(X):
    """Flatten ``X`` to (n_samples, -1) float64 and embed it with bh_sne (perplexity 5).

    bh_sne is used instead of scikit-learn's TSNE because of a bug the
    original author hit in scikit-learn.
    """
    samples = np.asarray(X).astype('float64')
    n_samples = samples.shape[0]
    flattened = samples.reshape((n_samples, -1))
    return bh_sne(flattened, perplexity=5)
def extract_tsne_feat():
    """
    Extract tsne features.

    Note: python2 only.

    Builds three 2-D bh_sne embeddings (title, title+query, description) from
    per-field TF-IDF matrices of ``tmp2/df_w2vlem_join.csv`` and writes them to
    ``tmp2/df_tsne_feats.csv``; intermediate frames go to ``tmp2/tsne_t`` and
    ``tmp2/tsne_qt``.
    """
    def tfidf_matrix(column, max_ngram):
        # Fresh vectorizer per text field.
        vectorizer = TfidfVectorizer(ngram_range=(1, max_ngram),
                                     stop_words='english', min_df=2)
        return vectorizer.fit_transform(df_w2vlem_join[column].tolist())

    def embed(matrix, n_components):
        # SVD -> standardize -> t-SNE
        reduced = TruncatedSVD(n_components=n_components,
                               random_state=2016).fit_transform(matrix)
        return bh_sne(StandardScaler().fit_transform(reduced))

    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
    n_rows = len(df_w2vlem_join)
    df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)

    # title
    X_t = tfidf_matrix('t_w2v', 4)
    coords = embed(X_t, 100)
    df_feat['tsne_t_1'] = coords[:n_rows, 0]
    df_feat['tsne_t_2'] = coords[:n_rows, 1]
    df_feat.to_csv('tmp2/tsne_t', index=False)
    print(df_feat)

    # title + query
    X_q = tfidf_matrix('q_w2v', 4)
    coords = embed(sp.hstack([X_t, X_q]).tocsr(), 100)
    df_feat['tsne_qt_1'] = coords[:n_rows, 0]
    df_feat['tsne_qt_2'] = coords[:n_rows, 1]
    df_feat.to_csv('tmp2/tsne_qt', index=False)
    df_feat = pd.read_csv('tmp2/tsne_qt')
    print(df_feat)

    # description
    X_d = tfidf_matrix('d_w2v', 3)
    coords = embed(X_d, 70)
    df_feat['tsne_desc_1'] = coords[:n_rows, 0]
    df_feat['tsne_desc_2'] = coords[:n_rows, 1]

    df_feat.to_csv('tmp2/df_tsne_feats.csv')
def plot_latent_space(X, y, file_name):
    """
    This function employs TSNE to convert the latent space to 2D space and
    plots the result, one marker/color per activity class (labels 1..6).
    The figure is saved as ``<file_name>.png`` and then shown.
    """
    X = tf.cast(X, dtype='float64')
    X = bh_sne(X)

    # (class id, legend label, marker, color)
    activities = [
        (1, 'WALKING', 'x', 'r'),
        (2, 'WALKING_UPSTAIRS', '+', 'c'),
        (3, 'WALKING_DOWNSTAIRS', '^', 'k'),
        (4, 'SITTING', 'o', 'y'),
        (5, 'STANDING', 'o', 'm'),
        (6, 'LAYING', 'o', 'g'),
    ]
    subsets = [X[y == class_id] for class_id, _, _, _ in activities]

    plt.figure(figsize=(12, 10))
    handles = []
    for points, (_, _, marker, color) in zip(subsets, activities):
        handles.append(plt.scatter(points[:, 0], points[:, 1],
                                   marker=marker, color=color, alpha=0.3))

    legend_labels = tuple(label for _, label, _, _ in activities)
    plt.legend(tuple(handles), legend_labels,
               scatterpoints=1, loc='lower left', ncol=3, fontsize=8)
    plt.savefig(file_name + '.png')
    plt.show()
def test_iris():
    """Smoke-test bh_sne on the iris data: one 2-D point per input sample."""
    from sklearn.datasets import load_iris
    from tsne import bh_sne

    X = load_iris().data
    X_2d = bh_sne(X)
    # Check the result instead of discarding it.
    assert X_2d.shape == (X.shape[0], 2)
def tSNE(x_data, display_plot=False): vis_data = bh_sne(x_data) # tsne embedding vis_x = vis_data[:, 0] vis_y = vis_data[:, 1] if display_plot: plt.scatter(vis_x, vis_y, c='black') plt.show() '''
def collect_and_save_plot_information(main_df, reduced_dim=2, perplexity=30,
                                      n_kmeans_clusters=10):
    """
    Definition
    Reduces Dimensionality of Voltage Traces via t-SNE.
    Then applies K-means clustering. Also creates information strings for
    each datapoint containing its metadata. Finally saves it all in a
    pickle, for the script "display_interactive_plot.py" to use.

    :param main_df: dataframe with a 'Conc_Trace' column holding one
        equal-length trace per row, addressable by 0..n-1
    :param reduced_dim: embedding dimension passed to bh_sne
    :param perplexity: perplexity passed to bh_sne
    :param n_kmeans_clusters: number of K-means clusters used for coloring
    """
    # create a numpy array for voltage trace values to apply dim reduction
    # and clustering
    traces = main_df['Conc_Trace']
    n_channels = len(traces)
    n_trace_values = len(traces[0])
    data_array = np.zeros((n_channels, n_trace_values))
    for row in range(n_channels):
        data_array[row] = traces[row]

    # "we normalized each column by Z-scoring: we substracted its mean and
    # then divided by its standard deviation"
    col_std = data_array.std(axis=0)
    # A constant column has zero std; divide by 1 there so it becomes all
    # zeros instead of NaN.
    col_std[col_std == 0.0] = 1.0
    data_array = (data_array - data_array.mean(axis=0)) / col_std

    # apply t-SNE on data
    data2d = bh_sne(data_array, d=reduced_dim, perplexity=perplexity)

    # apply kmeans and save its labels for colorization
    kmeans = KMeans(n_clusters=n_kmeans_clusters)
    kmeans.fit(data2d)
    labels = kmeans.labels_

    # for assigning different colors to different clusters
    nr_to_color = ["b", "g", "r", "c", "y", "m", "k", "fuchsia",
                   "gray", "navy", "coral"]
    # Wrap around so more clusters than colors reuses colors instead of
    # raising IndexError.
    colors = [nr_to_color[cluster % len(nr_to_color)] for cluster in labels]

    # create list of labels, which will be displayed when clicking
    # on a datapoint
    text_list = [create_label_for_matplotlib(main_df.loc[counter])
                 for counter in range(n_channels)]

    # dataframe holding everything the interactive plot needs
    plot_df = pd.DataFrame(columns=["Value1", "Value2", "Color", "Label"])
    plot_df["Value1"] = data2d[:, 0]
    plot_df["Value2"] = data2d[:, 1]
    plot_df["Color"] = colors
    plot_df["Label"] = text_list

    # now that we got all we need, save in pickle
    plot_df.to_pickle("Interactive_Plot_Values.pickle")
def main(datafile, normalize, ndims, copula, clusteroutput, subsample): X, features = read_sah_h5(datafile) I, all_features = read_sah_h5(datafile, just_good=False) if 'id' in all_features: ids = X[:, all_features.index('id')] else: ids = np.arange(len(X)).astype(int) Xorig = X if normalize: mean = np.average(X, axis=0) std = np.std(X, axis=0) std[np.nonzero(std == 0.0)] = 1.0 # Avoid NaNs X = (X - mean) / std idx = np.random.randint(len(X), size=subsample) X = X[idx] ids = ids[idx] if copula: X = np.column_stack([copula_transform(x) for x in X.T]) # I added this for the time/freq clustering # to emphasize the frequency feature # X[:, 1] *= 1e-3 Y = bh_sne(X, d=ndims) dbscan = DBSCAN(eps=1.75, min_samples=5) C = dbscan.fit_predict(Y) tree = ExtraTreesClassifier(n_estimators=100) tree.fit(X, C) for f, i in zip(features, tree.feature_importances_): print '%s: %f' % (f, i) with open(clusteroutput, 'w+') as f: for c, i in zip(C, ids): f.write('%d,%d\n' % (i, c)) pl.scatter(Y[:, 0], Y[:, 1], color=pl.cm.spectral(C.astype(float) / np.max(C))) for c in np.unique(C): pl.bar(0, 0, lw=0, ec='none', fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c) pl.legend() pl.show()
def _tsne(X, dir_str="*.wav", perplexity=3, plotting=False):
    """
    Utility function to compute tsne

    :param X: data passed to bh_sne
    :param dir_str: glob pattern; the sorted matches label the plotted points
    :param perplexity: perplexity forwarded to bh_sne
    :param plotting: when true, plot the embedding with one text label per point
    :return: the bh_sne embedding
    """
    flist = sorted(glob.glob(dir_str))
    Z = bh_sne(X, perplexity=perplexity)
    if not plotting:
        return Z

    figure()
    plot(Z[:, 0], Z[:, 1], 'r.')
    # Label point i with the i-th file name.
    for i, p in enumerate(Z):
        text(p[0], p[1], '%s' % flist[i], fontsize=12)
    return Z
def create_clusters_dbscan():
    """Embed the imputed planet table with bh_sne, cluster it with DBSCAN and save two plots.

    Writes ``../data/QC010_TSNE_background.png`` (plain embedding) and
    ``../data/QC011_TSNE_clustering_w_sizes.png`` (points colored and sized
    by DBSCAN label).
    """
    np.random.seed(seed=12509234)

    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = [
        'pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl', 'pl_bmassj',
        'pl_radj', 'pl_dens', 'st_dist', 'st_optmag', 'st_teff', 'st_mass',
        'st_rad', 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp', 'pl_eqt',
        'st_plx', 'st_age', 'st_vsini', 'st_acts'
    ]
    df_p = c.get_physical_columns(df, cols_phys)

    # NOTE(review): logcols is defined but never used in this function.
    logcols = [
        'pl_bmassj', 'pl_dens', 'pl_orbper', 'pl_orbsmax', 'pl_radj',
        'st_dist', 'st_rad', 'st_teff', 'st_dens', 'pl_rvamp', 'st_plx',
        'st_vsini', 'st_acts'
    ]

    # Pre-process the data
    km_labels, df_imputed = cl.kmeans_centroid_fill(df_p, 3, 10)

    # Create TSNE embedding
    embedding = bh_sne(df_imputed, perplexity=40)
    vis_x_transit = embedding[:, 0]
    vis_y_transit = embedding[:, 1]

    # Create a background plot of TSNE embedding (y on the horizontal axis)
    fig = plt.figure(figsize=(12, 8))
    plt.scatter(vis_y_transit, vis_x_transit,
                c=['blue'],
                cmap=plt.cm.get_cmap("jet", 10),
                alpha=0.2)
    plt.savefig("../data/QC010_TSNE_background.png")

    # DBSCAN clustering on the (x, y) pairs
    X = np.array([vis_x_transit, vis_y_transit]).T
    dbs = DBSCAN(eps=2.1, min_samples=12)
    dbs.fit(X)
    cluster_ids = dbs.labels_

    # Generate clustering plot from TSNE
    n_clusters = len(np.unique(cluster_ids))
    fig = plt.figure(figsize=(15, 12))
    plt.scatter(vis_y_transit, vis_x_transit,
                c=cluster_ids,
                cmap=plt.cm.get_cmap("jet", n_clusters),
                alpha=1.0,
                s=10 * cluster_ids + 1)
    plt.colorbar(ticks=range(n_clusters))
    plt.clim(-0.5, n_clusters - 0.5)
    plt.savefig("../data/QC011_TSNE_clustering_w_sizes.png")
def image_scatter(features, images, img_res, res=4000, cval=1):
    """
    Embeds images via tsne into a scatter plot.

    Parameters
    ---------
    features: numpy array
        Features to visualize

    images: list or numpy array
        Corresponding images to features. Expects float images from (0,1).

    img_res: float or int
        Resolution to embed images at (currently unused: the resize step is
        disabled, images are pasted at their own size)

    res: float or int
        Size of embedding image in pixels

    cval: float or numpy array
        Background color value

    Returns
    ------
    canvas: numpy array
        Image of visualization
    """
    features = np.copy(features).astype('float64')  # change type
    images = [gray_to_color(image) for image in images]  # gray -> color
    max_width = max(image.shape[0] for image in images)
    max_height = max(image.shape[1] for image in images)

    # docs: https://github.com/danielfrg/tsne
    # alternative: http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE
    f2d = bh_sne(features)

    xx = f2d[:, 0]
    yy = f2d[:, 1]
    x_min, x_max = xx.min(), xx.max()
    y_min, y_max = yy.min(), yy.max()

    # Fix the ratios
    sx = (x_max - x_min)
    sy = (y_max - y_min)
    if sx > sy:
        res_x = sx / float(sy) * res
        res_y = res
    else:
        res_x = res
        res_y = sy / float(sx) * res

    # linspace sample counts (and array shapes) must be integers.
    res_x = int(res_x)
    res_y = int(res_y)

    # Pad each axis by the largest image extent along that axis; the second
    # axis used to be padded with the first axis' maximum.
    canvas = np.ones((res_x + max_width, res_y + max_height, 3)) * cval
    x_coords = np.linspace(x_min, x_max, res_x)
    y_coords = np.linspace(y_min, y_max, res_y)
    for x, y, image in zip(xx, yy, images):
        # Use the image's own size instead of a hard-coded 70x70 window
        # (identical for 70x70 inputs).
        w, h = image.shape[:2]
        x_idx = np.argmin((x - x_coords)**2)
        y_idx = np.argmin((y - y_coords)**2)
        canvas[x_idx:x_idx + w, y_idx:y_idx + h] = image
    return canvas
def make_multiple_cl_tsne(mat, cmap_left=None, cmap_right=None, skl_version=True,
                          random_state=0, learning_rate=40):
    """Embed the columns of ``mat`` with t-SNE and show two differently colored scatters.

    :param mat: matrix whose transpose is embedded (one point per column)
    :param cmap_left: per-point color values for the left panel ('prism' map)
    :param cmap_right: per-point color values for the right panel ('jet' map)
    :param skl_version: ``False`` uses bh_sne; anything else uses
        ``sklearn.manifold.TSNE``
    :param random_state: seed for the scikit-learn variant
    :param learning_rate: learning rate for the scikit-learn variant
    """
    from matplotlib import pyplot as plt
    import numpy as np

    # the matrix needs to be transposed in order to cluster the numbers;
    # float64 is needed for bh_sne
    x_data = np.asarray(mat.transpose()).astype('float64')

    # Explicit comparison kept on purpose: only a value equal to False
    # selects the bh_sne implementation.
    if skl_version == False:
        from tsne import bh_sne
        # perform t-SNE embedding, lowered perplexity
        embedding = bh_sne(x_data, perplexity=7)
    else:
        from sklearn import manifold
        # run tsne from sklearn
        tsne = manifold.TSNE(perplexity=7, n_iter=100000,
                             random_state=random_state, method='exact',
                             metric='correlation',
                             learning_rate=learning_rate, verbose=0,
                             n_iter_without_progress=1000, init='random',
                             early_exaggeration=4)
        embedding = tsne.fit_transform(x_data)

    vis_x = embedding[:, 0]
    vis_y = embedding[:, 1]

    fig, axarr = plt.subplots(ncols=2, figsize=(10, 5))
    marker_size = 150

    # always require cmap
    panels = ((axarr[0], cmap_left, 'prism'), (axarr[1], cmap_right, 'jet'))
    for axis, colors, cmap_name in panels:
        axis.scatter(vis_x, vis_y, c=colors,
                     cmap=plt.cm.get_cmap(cmap_name, len(colors)),
                     s=marker_size)

    plt.show()
def plot_tsne(X_sample, y_sample):
    """Embed ``X_sample`` with bh_sne and scatter it colored by ``argmax(y_sample, 1)``.

    :param X_sample: data passed to bh_sne
    :param y_sample: 2-D array whose per-row argmax gives each point's class
    """
    from tsne import bh_sne

    embedding = bh_sne(X_sample)
    xs, ys = embedding[:, 0], embedding[:, 1]
    class_ids = np.argmax(y_sample, 1)

    # Ten discrete colors with ticks centered on the integers 0..9.
    plt.scatter(xs, ys, c=class_ids, cmap=plt.cm.get_cmap("jet", 10))
    plt.colorbar(ticks=range(10))
    plt.clim(-0.5, 9.5)
    plt.show()
def _fit_transform(self, x_in):
    """
    fit to data, and return the transform

    Args:
        x_in (numpy.array): Input numpy array

    Returns:
        numpy.array: result of bh_sne on ``x_in`` cast to float, using this
        object's ``perplexity`` and ``theta``
    """
    as_float = x_in.astype(float)
    return _tsne.bh_sne(as_float,
                        perplexity=self.perplexity,
                        theta=self.theta)
def run_tsne(transformed_pca_matrix, name='TSNE', key='TSNE', tsne_dims=None, input_pcs=None, perplexity=None, theta=None,
             max_iter=None, stop_lying_iter=None, mom_switch_iter=None, copy_data=False, random_state=None):
    """Run Barnes-Hut t-SNE on a PCA-transformed matrix and wrap the result in ``TSNE``.

    Every tuning argument left as ``None`` falls back to the matching value
    in ``analysis_constants``. ``input_pcs``, when given, keeps only that
    many leading columns of the input. The perplexity is capped at
    ``max(1, (N - 1) / 3 - 1)`` for ``N`` input rows.
    """
    defaults = analysis_constants

    tsne_dims = defaults.TSNE_N_COMPONENTS if tsne_dims is None else tsne_dims
    perplexity = defaults.TSNE_DEFAULT_PERPLEXITY if perplexity is None else perplexity
    theta = defaults.TSNE_THETA if theta is None else theta
    random_state = defaults.RANDOM_STATE if random_state is None else random_state
    max_iter = defaults.TSNE_MAX_ITER if max_iter is None else max_iter
    stop_lying_iter = defaults.TSNE_STOP_LYING_ITER if stop_lying_iter is None else stop_lying_iter
    mom_switch_iter = defaults.TSNE_MOM_SWITCH_ITER if mom_switch_iter is None else mom_switch_iter

    if input_pcs is not None:
        transformed_pca_matrix = transformed_pca_matrix[:, :input_pcs]

    # Make sure perplexity satisfies 'tsne' requirements
    N = transformed_pca_matrix.shape[0]
    perplexity_cap = max(1, -1 + float((N - 1)) / 3)
    perplexity = min(perplexity, perplexity_cap)

    rng = np.random.RandomState(random_state)
    transformed_tsne_matrix = tsne_bh.bh_sne(transformed_pca_matrix,
                                             d=tsne_dims,
                                             theta=theta,
                                             perplexity=perplexity,
                                             max_iter=max_iter,
                                             stop_lying_iter=stop_lying_iter,
                                             mom_switch_iter=mom_switch_iter,
                                             copy_data=copy_data,
                                             random_state=rng)

    return TSNE(transformed_tsne_matrix, name=name, key=key)
def run_BH_tSNE(table, do_pca=True):
    """Add ``bh_tsne_x`` / ``bh_tsne_y`` columns to ``table`` from a k-mer based BH-tSNE.

    The k-mer counts of every contig in ``table`` are looked up in the
    module-level ``k_mer_dict``, normalized, optionally reduced to 50 PCA
    dimensions and embedded in 2-D. ``table`` is modified in place.

    :param table: dataframe with a 'contig' column
    :param do_pca: run PCA first when there are more than 50 contigs
    """
    pca_dimensions = 50
    perplexity = 30.0

    logger.info("run_BH_tSNE: Running k-mer based binning...")
    # Note - currently doesn't handle cases where PCA dimensions and perplexity set too high

    # Submatrix of k-mer counts for the contigs in the table
    k_mer_counts_submatrix = [k_mer_dict[row['contig']]
                              for _, row in table.iterrows()]
    normalized_k_mer_submatrix = normalizeKmers(k_mer_counts_submatrix)
    n_points = len(normalized_k_mer_submatrix)

    # PCA
    use_pca = (n_points > pca_dimensions) and (do_pca == True)
    if use_pca:
        logger.info('run_BH_tSNE: Principal component analysis')
        pca = decomposition.PCA(n_components=pca_dimensions)
        pca_matrix = pca.fit_transform(normalized_k_mer_submatrix)
    else:
        logger.info('run_BH_tSNE: Principle component analysis step skipped')

    # BH-tSNE
    logger.info('run_BH_tSNE: BH-tSNE')

    # Adjust perplexity according to the number of data points
    # Took logic from tsne source code
    if (n_points - 1) < (3 * perplexity):
        perplexity = (float(n_points - 1) / 3) - 1

    logger.info(str(n_points) + ' data points')
    logger.info(str(len(normalized_k_mer_submatrix[0])) + ' dimensions')

    X = np.array(pca_matrix) if use_pca else np.array(normalized_k_mer_submatrix)
    bh_tsne_matrix = bh_sne(X, d=2, perplexity=perplexity, theta=0.5)

    # Add the two embedding coordinates as columns of the contig table
    table['bh_tsne_x'] = pd.Series([point[0] for point in bh_tsne_matrix],
                                   index=table.index)
    table['bh_tsne_y'] = pd.Series([point[1] for point in bh_tsne_matrix],
                                   index=table.index)
def main():
    """Command-line entry point: t-SNE a random subset of feature rows and save it as CSV.

    Reads a numpy feature file and a text file with one image name per row,
    keeps at most ``--max_num_points`` randomly chosen rows, embeds them with
    bh_sne and passes names and 2-D coordinates to ``save_to_csv``.
    """
    # parse arguments
    parser = argparse.ArgumentParser(
        description='Preprocess data using t-SNE.')
    parser.add_argument('input_data', type=str,
                        help='Path to numpy data file')
    parser.add_argument('input_images_list', type=str,
                        help='Path to text file listing images')
    parser.add_argument('--input_images_dir', type=str,
                        default='public/data/',
                        help='Path to images folder')
    parser.add_argument('--output_file', type=str,
                        default='public/data.csv',
                        help='Path to output CSV file')
    parser.add_argument('--max_num_points', type=int, default=1000,
                        help='Max number of points')
    args = parser.parse_args()

    # load data
    data_load = np.load(args.input_data)
    with open(args.input_images_list, 'r') as f:
        image_names_load = [line.strip() for line in f]

    # shuffle and reduce number of data points to run faster
    # this also results in cleaner visualization
    assert len(image_names_load) > 0
    assert data_load.shape[0] == len(image_names_load)

    # Shuffle a real list (a range object cannot be shuffled on Python 3)
    # and keep no more rows than actually exist, so the output never
    # contains padding rows of zeros.
    n_points = max(0, min(args.max_num_points, data_load.shape[0]))
    indices = list(range(data_load.shape[0]))
    random.shuffle(indices)
    keep = indices[:n_points]

    data = np.asarray(data_load[keep, :], dtype=np.float64)
    image_names = [image_names_load[i] for i in keep]
    assert data.shape[0] == len(image_names), '{0} and {1}'.format(
        data.shape[0], len(image_names))

    # run dimensionality reduction with t-SNE
    data_tsne = bh_sne(data)
    xs = data_tsne[:, 0]
    ys = data_tsne[:, 1]

    # save to csv file
    save_to_csv(args.output_file, image_names, xs, ys)
def dim_reduction(main_df, reduced_dim=2, perplexity=30, n_kmeans_clusters=10):
    """Embed the 'Conc_Trace' rows with t-SNE, cluster them and plot them.

    Each entry of `main_df['Conc_Trace']` becomes one row of a matrix whose
    columns are z-scored, embedded with `bh_sne`, clustered with KMeans and
    drawn as an interactive scatter plot (one clickable point per row).

    Args:
        main_df: DataFrame with a 'Conc_Trace' column; rows are addressed by
            the labels 0..n-1 (assumes a default integer index -- TODO
            confirm with callers).
        reduced_dim: dimensionality passed to `bh_sne` as `d`.
        perplexity: t-SNE perplexity.
        n_kmeans_clusters: number of KMeans clusters used for colouring.

    Returns:
        0, after the plot window has been shown.
    """
    # create a numpy array for voltage trace values
    traces = main_df['Conc_Trace']
    n_channels = len(traces)
    n_trace_values = len(traces[0])
    data_array = np.zeros((n_channels, n_trace_values))
    # copy values
    for row in range(n_channels):
        data_array[row] = traces[row]

    # "we normalized each column by Z-scoring: we subtracted its mean and
    # then divided by its standard deviation"
    column_std = data_array.std(axis=0)
    # A constant column has zero deviation; dividing by it would fill the
    # column with NaN and poison the embedding, so leave it at zero instead.
    column_std[column_std == 0] = 1.0
    data_array = (data_array - data_array.mean(axis=0)) / column_std

    # apply t-SNE on data
    data2d = bh_sne(data_array, d=reduced_dim, perplexity=perplexity)

    # apply kmeans and save its labels for colorization
    kmeans = KMeans(n_clusters=n_kmeans_clusters)
    kmeans.fit(data2d)
    labels = kmeans.labels_

    # create list of labels, which will be displayed when clicking
    # on a datapoint
    text_list = [create_label_for_matplotlib(main_df.loc[counter])
                 for counter in range(n_channels)]

    # plot
    plt.figure("Interactive Plot", figsize=(20, 10))
    plt.title("Interactive Plot of Channels, click on Datapoints for Info")

    # for assigning different colors to different clusters
    nr_to_color = ["b", "g", "r", "c", "y", "m", "k", "fuchsia",
                   "gray", "navy", "coral"]
    n_colors = len(nr_to_color)

    # plot every datapoint on its own (not as one vectorised scatter call)
    # so that each point carries its own text label
    for dp in range(n_channels):
        # wrap around the palette so more clusters than colours cannot
        # raise an IndexError
        color = nr_to_color[labels[dp] % n_colors]
        plt.scatter(data2d[dp, 0], data2d[dp, 1], linewidths=0.1,
                    c=color, label=text_list[dp])

    # add datacursor, basically enabling "clickability" of datapoints
    datacursor(formatter='{label}'.format, bbox=dict(fc='white'),
               arrowprops=dict(arrowstyle='simple', fc='black', alpha=0.5))

    plt.show()
    return 0
def visualize_tsne():
    """
    play around with tsne to visualize image space

    Walks the category/user folders below the hard-coded image root, joins
    the tracker rows of the images found there with each user's pickled fc7
    features, embeds all fc7 vectors with `bh_sne` and shows a scatter plot
    coloured by username.  The colour bar is laid out for 28 users.
    """
    import matplotlib.pyplot as plt
    from tsne import bh_sne

    img_root = '/Volumes/micro/recommend-a-graham/imgs/'
    tracker_df = pd.read_pickle('./tracker.pkl')

    dfs = []
    for category in listdir(img_root):
        for user in listdir(img_root + category):
            img_ids = listdir(
                '/Volumes/micro/recommend-a-graham/imgs/{}/{}/'.format(
                    category, user))
            # isin() hashes the ids once instead of scanning the list for
            # every row.
            sub_df = tracker_df[tracker_df.img_id.isin(img_ids)]
            user_df = pd.read_pickle('../fc7_pkls/fc7_{}.pkl'.format(user))
            user_df = user_df[user_df.shortcode.isin(sub_df.shortcode.values)]
            dfs.append(pd.merge(sub_df, user_df, on='shortcode'))

    dfs = pd.concat(dfs, axis=0)
    dfs.reset_index(inplace=True)

    # Stack every fc7 vector as one row with a single concatenate call
    # rather than growing the matrix one row at a time.
    x_data = np.concatenate(
        [vector.reshape(1, vector.shape[0]) for vector in dfs.fc7.values],
        axis=0)
    print(x_data.shape)

    usernames = dfs.username.unique()
    y_dict = {k: i for i, k in enumerate(usernames)}
    y_data = dfs.username.apply(lambda x: y_dict[x]).values

    vis_data = bh_sne(x_data)
    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]

    plt.scatter(vis_x, vis_y, c=y_data, cmap=plt.cm.get_cmap("jet", 28))
    cbar = plt.colorbar()
    cbar.set_ticks([i*29./28 + 29./56 for i in range(28)])
    # list() so the labels are a real sequence even where zip is lazy
    cbar.set_ticklabels(list(zip(usernames,
                                 [user_cat_dict[i] for i in usernames])))
    plt.clim(0, 29)
    plt.title('tsne, fc7, 100img_per_user, 4user_per_categ')
    plt.show()
def run(self):
    """Embed the feature vectors with t-SNE and write the 2-D result as TSV.

    Reads the vectors named by the "vecs_with_id" setting (restricted to
    `getSampleIds()`), runs `bh_sne` with PCA disabled and the configured
    theta, and writes (index, x, y) rows to the "article_embedding" file.
    """
    config = Config.get()

    # Create the embedding.
    features = Utils.read_features(
        config.getSample("ExternalFiles", "vecs_with_id"),
        id_set=getSampleIds())
    ids = list(features.keys())
    matrix = np.array([features[vec_id]["vector"] for vec_id in ids])
    embedding = bh_sne(
        matrix,
        pca_d=None,
        theta=config.getfloat("PreprocessingConstants", "tsne_theta"))

    xs = list(embedding[:, 0])
    ys = list(embedding[:, 1])
    Utils.write_tsv(config.getSample("ExternalFiles", "article_embedding"),
                    ("index", "x", "y"), ids, xs, ys)
def extract_w2v_tsne_feat():
    """
    Extract w2v tsne features.

    Loads 'tmp2/df_w2v_feats.csv', standard-scales the values, embeds them
    with `bh_sne` and saves the two coordinates (columns 'tsne_t_1' and
    'tsne_t_2') to 'tmp2/df_tsne_w2v_feats.csv'.

    Note: python2 only.
    Worst in cv, so do not use this.
    """
    w2v_frame = pd.read_csv('tmp2/df_w2v_feats.csv', index_col=0)
    n_rows = len(w2v_frame)

    embedding = bh_sne(StandardScaler().fit_transform(w2v_frame.values))

    tsne_frame = pd.DataFrame(index=w2v_frame.index.values)
    for axis, column in enumerate(('tsne_t_1', 'tsne_t_2')):
        tsne_frame[column] = embedding[:n_rows, axis]
    tsne_frame.to_csv('tmp2/df_tsne_w2v_feats.csv')
def make_sample_df(labels, np, labeled_data, limit, algorithm_name, dims, cores):
    """Build one t-SNE DataFrame per label for the first three unique labels.

    Args:
        labels: label values; only the first three entries of
            `np.unique(labels)` are used.
        np: the numpy module, passed in explicitly by the caller.
        labeled_data: 2-D array whose column 0 holds the label and whose
            remaining columns hold the features.
        limit: maximum number of rows sampled per label.
        algorithm_name: value stored in the 'algorithm' column.
        dims: value stored in the 'dimension' column.
        cores: not used by this function.

    Returns:
        List of DataFrames with columns X, Y, dimension, label, algorithm.
    """
    frames = []
    for label in np.unique(labels)[0:3]:
        # all feature rows carrying this label
        members = labeled_data[labeled_data[:, 0] == label, 1:]

        # random sub-sample of at most `limit` rows
        order = np.arange(members.shape[0])
        np.random.shuffle(order)
        keep = min(limit, members.shape[0])

        embedding = bh_sne(members[order[:keep], :])
        n_rows = embedding.shape[0]
        frames.append(pd.DataFrame({
            "X": embedding[:, 0],
            "Y": embedding[:, 1],
            "dimension": [dims] * n_rows,
            "label": [label_dict[label] for _ in range(n_rows)],
            "algorithm": [algorithm_name] * n_rows,
        }))
    return frames
def _fit_transform(self, x_in):
    """Run Barnes-Hut t-SNE on the input and return the embedding.

    Args:
        x_in (numpy.array): Input array; it is cast to float first.

    Returns:
        The result of `_tsne.bh_sne` for this instance's `perplexity`
        and `theta`.
    """
    as_float = x_in.astype(float)
    return _tsne.bh_sne(as_float, perplexity=self.perplexity, theta=self.theta)
def read_sne_video():
    """Embed './test_data.csv' with BH-tSNE and pass the result to `makeVideo`.

    Labels are read from './test_labels.csv'.  The last column of the data
    file is dropped before the embedding is computed.
    """
    my_data = np.genfromtxt('./test_data.csv', delimiter=',')
    labels = np.genfromtxt('./test_labels.csv', delimiter=',')
    # Single-argument print() emits the same text on Python 2 and 3.
    print("data incoming shape %s" % (my_data.shape,))

    # Keep the data 2-D but drop the last column.
    # NOTE(review): the original note says this "trims the NaNs" / "the third
    # column" -- presumably a trailing delimiter yields an all-NaN column;
    # confirm against the CSV.
    data = my_data[:, :-1]

    X_2d = bh_sne(data, perplexity=19.0, theta=0.5)
    makeVideo(X_2d, labels)
def tsne_pca():
    """Embed the stored t-SNE runs of one experiment and save the result.

    Takes the fifth query result from the database, pushes each entry of its
    DATA/TSNE_DATA list (reshaped to N x 2) through `castPCA2` and `varimax`,
    flattens the six results into a 6-row matrix, embeds that with `bh_sne`,
    shows a scatter plot and stores the flattened coordinates back under
    DATA/PCA.  Prints "No results" and returns when the query yields None.
    """
    # mongo stuff
    dbClient = DatabaseClient()
    filteredResults = dbClient.query()

    if filteredResults is None:
        print("No results")
        return

    filteredId = filteredResults[4]['_id']
    experiment = dbClient.get(filteredId)
    list_of_coords = experiment['DATA']['TSNE_DATA']

    pca_list = []
    for coords in list_of_coords:
        coords_array = np.reshape(coords, (-1, 2))
        cast = castPCA2(coords_array)
        # Single-argument print() keeps the output identical on Python 2
        # and valid on Python 3.
        print("cast:  %s" % (cast.shape,))
        cast_veri = varimax(cast)
        print("cast_veri %s" % (cast_veri.shape,))
        pca_list.append(cast_veri)

    np_pca = np.asarray(pca_list)
    print("pca:  %s" % (np_pca.shape,))

    # one flattened row per run; the code is fixed to six runs
    np_pca = np.reshape(np_pca, (6, -1))
    print("pca:  %s" % (np_pca.shape,))
    labels = np.asarray([1, 2, 3, 4, 5, 6])

    print("SNEPCA BH")
    sne_pca = bh_sne(np_pca, perplexity=1.0, theta=0.5)

    plt.scatter(sne_pca[:, 0], sne_pca[:, 1], c=labels)
    plt.show()

    flat_coords = np.reshape(sne_pca, (1, -1)).tolist()[0]
    experiment['DATA']['PCA'] = flat_coords
    dbClient.update(filteredId, experiment)
def process_files(in_file, out_file):
    """
    Read data from in_file, and output to out_file

    Each input line is cleaned with `clean_line` and split on whitespace;
    the first token is the word and the remaining tokens are its embedding
    values.  All embeddings are reduced to 2-D with `bh_sne` and written to
    `out_file` as one "x y" pair per line, in input order.

    Raises:
        ValueError: if `in_file` contains no lines.
    """
    sys.stderr.write('# in_file = %s, out_file = %s\n' % (in_file, out_file))

    # input
    sys.stderr.write('# Input from %s.\n' % (in_file))
    with codecs.open(in_file, 'r', 'utf-8') as inf:
        all_lines = inf.readlines()

    # output
    sys.stderr.write('Output to %s\n' % out_file)
    check_dir(out_file)

    num_words = len(all_lines)
    sys.stderr.write('# Processing file %s ...\n' % (in_file))
    sys.stderr.write('# num words = %d\n' % (num_words))
    if num_words == 0:
        # Without any line the embedding matrix would never be created.
        raise ValueError('no embeddings found in %s' % in_file)

    X = None
    for line_id, line in enumerate(all_lines):
        tokens = re.split(r'\s+', clean_line(line))
        if line_id == 0:
            # the first line fixes the dimensionality of every embedding
            num_dim = len(tokens) - 1
            sys.stderr.write('# num dims = %d\n' % (num_dim))
            X = np.zeros((num_words, num_dim))

        # Parse the full token text; the former 4-byte string dtype cut
        # every value down to its first four characters.
        X[line_id, :] = [float(token) for token in tokens[1:]]

        if (line_id + 1) % 10000 == 0:
            sys.stderr.write(' (%d) ' % (line_id + 1))

    sys.stderr.write('Done! Num lines = %d\n' % num_words)

    X_2d = bh_sne(X)
    with codecs.open(out_file, 'w', 'utf-8') as ouf:
        for point in X_2d:
            ouf.write('%f %f\n' % (point[0], point[1]))
def pca_tsne():
    """PCA-rotate 60 blocks of 10000 rows, then embed the blocks with BH-tSNE.

    Reads '../data/CSV/appended_data.csv' and its labels file.  Every block
    of 10000 consecutive rows is transformed by a PCA that keeps all
    components and flattened into a single row; the resulting rows (one per
    block) are embedded with `bh_sne` and passed to `plot_save` together with
    the first 60 labels.
    """
    n_blocks = 60
    block_rows = 10000

    my_data = np.genfromtxt('../data/CSV/appended_data.csv', delimiter=',')
    labels = np.genfromtxt('../data/CSV/appended_data_labels.csv',
                           delimiter=',')
    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("data incoming shape %s" % (my_data.shape,))
    print("labels %s" % (labels,))
    print("shape lab %s" % (labels.shape,))
    labels = labels[0:n_blocks]

    # all columns are kept
    X_2d = my_data

    # first block seeds the accumulated matrix
    new = X_2d[0:block_rows]
    print("new  %s" % (new.shape,))
    pca = PCA(n_components=new.shape[1])
    new = pca.fit_transform(new)
    new = np.reshape(new, (1, -1))
    print("XR:  %s" % (new.shape,))

    # remaining blocks are appended as further rows
    for t in range(1, n_blocks):
        X_ = X_2d[t * block_rows:t * block_rows + block_rows]
        print("X_:  %s" % (X_.shape,))
        pca = PCA(n_components=X_.shape[1])
        X_r = pca.fit_transform(X_)
        print("XR:  %s" % (X_r.shape,))
        X_r = np.reshape(X_r, (1, -1))
        new = np.append(new, X_r, axis=0)
        print("new:  %s" % (new.shape,))

    print(new.shape)
    X_bn = bh_sne(new[:1000], perplexity=5.0, theta=0.5)

    data0 = X_bn.shape[0]
    data1 = X_bn.shape[1]

    # plot & save
    plot_save(X_bn, labels, data0, data1, "bn_sne")
def get_tsne_mapping(materials_list=None):
    """Return the t-SNE point map for the materials, using a pickle cache.

    If 'tsne_points.pickle' can be opened, its unpickled content is returned
    as is.  Otherwise the materials are vectorised with
    `vectorize_random(4)`, embedded with `bh_sne`, plotted via `plot_tsne`,
    and the resulting list of {'pt': ..., 'material': ...} dicts is cached
    to that file and returned.

    Args:
        materials_list: materials to embed; defaults to
            `get_materials_list()`.
    """
    if materials_list is None:
        # Doesn't call get_materials_list() when module is loaded
        materials_list = get_materials_list()
    try:
        _log.info('Trying data cache for t-SNE mapping')
        # Pickle streams are bytes: binary mode is required on Python 3 and
        # is also safe on Python 2.
        # NOTE: unpickling executes code from the file; only use a cache
        # file this function wrote itself.
        with open('tsne_points.pickle', 'rb') as f:
            _log.info('Using pickled t-SNE points')
            return pickle.load(f)
    except IOError:
        X = vectorize_random(4)(materials_list)
        X_2d = bh_sne(X)
        _log.info('t-SNE plot at {}'.format(plot_tsne(X_2d)))
        point_map = [{'pt': pt, 'material': m}
                     for pt, m in zip(X_2d, materials_list)]
        with open('tsne_points.pickle', 'wb') as f:
            pickle.dump(point_map, f)
        return point_map
def mds(dataSet):
    """Embed a random row sample of the data set with t-SNE and save both.

    `sampleSize` rows are drawn (with replacement, via `np.random.randint`)
    from the transposed feature columns, embedded with `bh_sne` at a fixed
    perplexity of 5, and the sample and its projection are saved to
    `valuesFile` and `projectionFile`.
    """
    # Load all feature columns.
    feature_rows = featureColumns(dataSet).transpose()
    picks = np.random.randint(len(feature_rows), size=sampleSize)
    sample = feature_rows[picks]
    print(sample)

    print("Begin TSNE.")
    embedding = bh_sne(sample, perplexity=5)
    print("End TSNE.")

    # Save intermediate MDS.
    np.save(valuesFile, sample)
    np.save(projectionFile, embedding)
def make_multiple_cl_tsne(mat, cmap_left=None, cmap_right=None, skl_version=True, random_state=0, learning_rate=40):
    """Show the t-SNE embedding of `mat`'s columns twice, coloured two ways.

    Args:
        mat: matrix whose transpose is embedded (one point per column).
        cmap_left: colour values for the left panel ('prism' colormap).
        cmap_right: colour values for the right panel ('jet' colormap).
            Both are required in practice because their length is taken.
        skl_version: when equal to False the `tsne.bh_sne` implementation is
            used, otherwise scikit-learn's exact TSNE.
        random_state: seed for the scikit-learn variant.
        learning_rate: learning rate for the scikit-learn variant.
    """
    from matplotlib import pyplot as plt
    import numpy as np

    # Transpose so that the columns are the items being clustered, and
    # convert to float64, which bh_sne needs.
    samples = np.asarray(mat.transpose()).astype('float64')

    if skl_version == False:
        from tsne import bh_sne
        # perform t-SNE embedding, lowered perplexity
        embedding = bh_sne(samples, perplexity=7)
    else:
        from sklearn import manifold
        # run tsne from sklearn
        model = manifold.TSNE(perplexity=7,
                              n_iter=100000,
                              random_state=random_state,
                              method='exact',
                              metric='correlation',
                              learning_rate=learning_rate,
                              verbose=0,
                              n_iter_without_progress=1000,
                              init='random',
                              early_exaggeration=4)
        embedding = model.fit_transform(samples)

    xs, ys = embedding[:, 0], embedding[:, 1]

    fig, axarr = plt.subplots(ncols=2, figsize=(10, 5))
    marker_size = 150

    # always require cmap
    panels = ((cmap_left, 'prism'), (cmap_right, 'jet'))
    for axis, (colors, cmap_name) in zip(axarr, panels):
        axis.scatter(xs, ys, c=colors,
                     cmap=plt.cm.get_cmap(cmap_name, len(colors)),
                     s=marker_size)

    plt.show()
def plot_bn_sne(data, labels, size):
    """Embed the first `size` rows of `data` with BH-tSNE and plot them.

    Args:
        data: 2-D array of samples.
        labels: array of labels aligned with the rows of `data`.
        size: number of leading rows (and labels) to keep.

    The embedding, the trimmed labels and the trimmed data dimensions are
    passed to `plot_save` under the title "bn-SNE".
    """
    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("data[0]:  %s" % (data.shape,))
    print("labels[0]:  %s" % (labels.shape,))

    # trim the data & labels down to reasonable size
    data = data[0:size]
    labels = labels[0:size]

    # sizes
    data0 = data.shape[0]
    data1 = data.shape[1]

    # dimensionality reduction with bn_sne
    X_2d = bh_sne(data, perplexity=19.0, theta=0.5)
    print("plot shape:  %s" % (X_2d.shape,))

    # plot & save
    plot_save(X_2d, labels, data0, data1, "bn-SNE")
def meta_pca_sne():
    """Embed the stored t-SNE runs of the first experiment and save the result.

    Takes the first query result from the database, truncates every row of
    its DATA/TSNE_DATA matrix to 500 values, embeds the rows with `bh_sne`
    and stores the flattened coordinates back under DATA/META.  Prints
    "No results" and returns when the query yields None.
    """
    # mongo stuff
    dbClient = DatabaseClient()
    filteredResults = dbClient.query()

    if filteredResults is None:
        print("No results")
        return

    filteredId = filteredResults[0]['_id']
    experiment = dbClient.get(filteredId)
    list_of_coords = experiment['DATA']['TSNE_DATA']

    np_list = np.asarray(list_of_coords)
    N, X = np_list.shape

    # Debug output; single-argument print() keeps the text identical on
    # Python 2 and valid on Python 3.
    print("NX %s %s" % (N, X))
    print("L0T %s" % (type(np_list[0]),))
    print("L0TI %s" % (type(np_list[0][0]),))
    print("L0S %s" % (np_list[0].shape,))
    print("L0 %s" % (np_list[0],))
    print("L1 %s" % (np_list[1],))

    # only the first 500 values of every run are embedded
    np_list = np_list[:, :500]

    print("META BH")
    sne_co = bh_sne(np_list, perplexity=1.0, theta=0.5)

    flat_coords = np.reshape(sne_co, (1, -1)).tolist()[0]
    experiment['DATA']['META'] = flat_coords
    dbClient.update(filteredId, experiment)
def tsne_visualize(L, labels, outfile='figs/tsne.jpg', perplexity=1):
    """Plot a t-SNE embedding of L with one annotation per label.

    Requires the optional tsne package; when `tsne_installed` is false a
    notice is printed and nothing else happens.

    Args:
        L: matrix handed to `bh_sne`.
        labels: labels passed to `annotate` together with the coordinates.
        outfile: path the figure is saved to; None skips saving.
        perplexity: t-SNE perplexity, also shown in the figure title.
    """
    if not tsne_installed:
        print('Sorry, tsne is not installed')
        return

    # Use t-sne algorithm to come up with points on 2d plane
    embedded = np.array(bh_sne(L, perplexity=perplexity))
    xs = embedded[:, 0]
    ys = embedded[:, 1]

    figure = plt.figure()
    figure.suptitle('Word vectors T-SNE ' + str(perplexity))
    annotate(xs, ys, labels)

    # save file
    if outfile is not None:
        figure.savefig(outfile)
    plt.show()
def tag_article_tsne_plot(self, sample_size = 20000):
    """Plot a joint t-SNE embedding of sampled article vectors and tag vectors.

    `sample_size` rows of `self.doc_vec` (random indices below 1000000, drawn
    with replacement) are stacked on top of `self.tag_vec`, embedded with
    `bh_sne` and drawn as two scatter series; every tag point is annotated
    with the matching entry of `self.tags`.  The figure is saved to
    '/mnt/data/tag_article_plot'.

    Side effects: sets `self.combined_matrix` and `self.combined_2d`.

    Args:
        sample_size: number of article vectors to sample.
    """
    samples = numpy.random.randint(0, 1000000, sample_size)
    self.combined_matrix = numpy.concatenate(
        (self.doc_vec[samples, ], self.tag_vec), axis=0)
    self.combined_2d = bh_sne(self.combined_matrix)

    # Rows [0, sample_size) are articles and tag rows start exactly at
    # sample_size.  Splitting at sample_size + 1 drew the first tag as an
    # article and shifted every label onto the next tag's point.
    article_pts = self.combined_2d[:sample_size]
    tag_pts = self.combined_2d[sample_size:]

    font = {'fontname': 'Tahoma',
            'fontsize': 0.1,
            'verticalalignment': 'top',
            'horizontalalignment': 'center'}

    pylab.subplots_adjust(bottom=0.1)
    pylab.scatter(article_pts[:, 0], article_pts[:, 1], marker='.',
                  cmap=pylab.get_cmap('Spectral'))
    pylab.scatter(tag_pts[:, 0], tag_pts[:, 1], marker='x',
                  cmap=pylab.get_cmap('Spectral'))
    pylab.title('NYT Articles and Labels(1991-2007)')
    pylab.xlabel('X')
    pylab.ylabel('Y')

    for label, x, y in zip(self.tags, tag_pts[:, 0], tag_pts[:, 1]):
        # NOTE(review): ha/va are passed here and their long-form aliases
        # again through **font; confirm the installed matplotlib accepts
        # both and which alignment is intended.
        pylab.annotate(label, xy=(x, y), xytext=None,
                       ha='right', va='bottom', **font)

    pylab.savefig('/mnt/data/tag_article_plot', bbox_inches='tight',
                  dpi=1000, orientation='landscape', papertype='a0')
    pylab.close()
def reduce_dimensions(matrix, reduction_type, n_components):
    """
    Return a lower-dimensional version of a matrix.

    :param matrix: The matrix to reduce.
    :param reduction_type: A ReductionTypes member selecting the method
        (PCA, sPCA, T_SNE or NMF).
    :param n_components: The number of components to keep. Not used by the
        T_SNE branch, which embeds the transposed matrix with bh_sne's
        defaults.
    :return: The reduced matrix, or None for an unrecognised reduction type.
    """
    if reduction_type is ReductionTypes.PCA:
        reducer = PCA(n_components=n_components, whiten=False)
        return reducer.fit_transform(matrix)

    if reduction_type is ReductionTypes.sPCA:
        reducer = SparsePCA(n_components=n_components, alpha=.5)
        return reducer.fit_transform(matrix)

    if reduction_type is ReductionTypes.T_SNE:
        return bh_sne(matrix.transpose())

    if reduction_type is ReductionTypes.NMF:
        reducer = ProjectedGradientNMF(n_components=n_components,
                                       init='nndsvd', random_state=0)
        return reducer.fit_transform(matrix)

    return None