def tsne_reduce(self, data):
    """Embed ``data`` with the Barnes-Hut approximation of t-SNE.

    The target dimensionality and the iteration budget are read from
    ``self.opts.n_dims`` and ``self.opts.n_iters``; the PCA pre-reduction
    step of the wrapper is switched off and verbose logging is enabled.

    Returns whatever ``bhtsne.run_bh_tsne`` returns for these settings
    (the low-dimensional map used for plotting).
    """
    options = self.opts
    return bhtsne.run_bh_tsne(
        data,
        no_dims=options.n_dims,
        max_iter=options.n_iters,
        use_pca=False,
        verbose=True,
    )
def run_tsne(features_file, colors_file, output_prefix,
             filter_sample=None,
             filter_cluster=None,
             lst=None,
             draw_per=1.0,
             iter=1000,
             perplexity=50):
    """Embed contig features with Barnes-Hut t-SNE and save a scatter plot as PDF.

    Parameters
    ----------
    features_file : path of a header-less table; column 0 is the row name,
        the remaining columns are the feature values.
    colors_file : path of a header-less table; column 0 is the row name and
        column 1 holds a label from which the first number is extracted
        (via the module-level ``extract_num`` pattern) as the cluster id.
    output_prefix : path handed to ``PdfPages`` for the output document.
    filter_sample : optional list of sample ids; only rows whose name starts
        with ``"sample<id>-"`` are kept.
    filter_cluster : optional list of cluster ids used for the cluster-centre
        annotations and the legend (legend only when fewer than 5 ids).
    lst : optional list of names (name prefixes when ``draw_per < 1``) to
        highlight with a distinct marker.
    draw_per : when below 1, rows are sub-sampled through
        ``divide_by_cluster`` / ``take_first_per``.
    iter : maximum number of t-SNE iterations (name kept for callers even
        though it shadows the builtin).
    perplexity : t-SNE perplexity.

    Raises
    ------
    ValueError
        If no row with length > 2000 is left after filtering.
    """
    # ``None`` replaces the former mutable ``[]`` defaults.
    filter_sample = [] if filter_sample is None else filter_sample
    filter_cluster = [] if filter_cluster is None else filter_cluster
    lst = [] if lst is None else lst
    max_iter = iter

    # read data
    data_df = pd.read_table(features_file, header=None)
    cluster_colors = pd.read_table(colors_file, header=None)
    print(data_df.head())

    # make dataframe pretty
    cluster_colors = cluster_colors.rename(columns={1: 'color'})
    cluster_colors["color"] = [int(extract_num.findall(str(x))[0])
                               for x in cluster_colors["color"].tolist()]
    print(cluster_colors.head())

    # filter by samples
    if len(filter_sample) > 0:
        prefixes = tuple("sample" + it + "-" for it in filter_sample)
        keep = [x for x in cluster_colors[0].tolist() if x.startswith(prefixes)]
        cluster_colors = cluster_colors[cluster_colors[0].isin(keep)]

    # filter by percent
    if draw_per < 1:
        names = cluster_colors[0].tolist()  # hoisted: invariant in the loops below
        clusters = divide_by_cluster(names, cluster_colors["color"].tolist())
        # NOTE(review): ``draw_per`` itself is not forwarded to take_first_per;
        # kept as in the original call - confirm against the helper.
        filter2 = take_first_per(clusters, lst)
        already_kept = set(filter2)
        lst_new = []
        for n in lst:
            for x in names:
                if x.startswith(n):
                    print(x)
                    lst_new.append(x)
                    if x not in already_kept:
                        filter2.append(x)
        # from here on ``lst`` holds full row names instead of prefixes
        lst = lst_new
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter2)]

    # merge data
    mapped = pd.merge(cluster_colors, data_df, on=0)

    # filter by length (4th "_"-separated field of the row name)
    mapped["length"] = [int(x.split("_")[3]) for x in mapped[0].tolist()]
    # .copy() so the x/y columns added below do not write into a slice
    mapped = mapped[mapped["length"] > 2000].copy()
    print(mapped)
    if mapped.empty:
        raise ValueError("no rows with length > 2000 remain after filtering")

    # normalize like in CONCOCT: pseudo-count, column-normalise,
    # row-normalise, log-transform
    data = mapped.iloc[:, 2:-1].values  # feature columns only
    v = (1.0 / mapped["length"]).values[:, np.newaxis]
    data = data + v
    data = data / data.sum(axis=0)[None, :]
    data = data / data.sum(axis=1)[:, None]
    data = np.log(data)

    embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1],
                                         perplexity=perplexity,
                                         max_iter=max_iter)
    mapped["x"] = embedding_array[:, 0]
    mapped["y"] = embedding_array[:, 1]

    # ``cm.spectral`` was renamed to ``cm.nipy_spectral``; support both.
    colormap = getattr(cm, "nipy_spectral", None) or cm.spectral

    # draw result of TSNE on scatter plot
    pp = PdfPages(output_prefix)
    try:
        # filter clusters to show
        fc = filter_cluster
        if len(fc) > 0:
            filtered = mapped[mapped["color"].isin(fc)]
        else:
            filtered = mapped

        fig = pyplot.figure()

        # draw scatter plot
        color = mapped["color"].tolist()
        # avoid dividing by zero when the only cluster id is 0
        mx_color = max(color) or 1

        def shade(cluster_id):
            return colormap(float(cluster_id) / mx_color)

        pyplot.scatter(mapped["x"].tolist(), mapped["y"].tolist(),
                       c=[shade(i) for i in color])

        # make a legend for specific clusters
        # find cluster centers
        x = filtered["x"].tolist()
        y = filtered["y"].tolist()
        mp = divide_by_color(x, y, filtered["color"].tolist())
        points, names = find_cluster_centers(mp)
        patches = []
        if len(fc) < 5:
            for c in set(color):
                if c in fc:
                    patches.append(mpatches.Patch(color=shade(c),
                                                  label='C-' + str(c)))
        pyplot.legend(handles=patches)
        draw_points(points, names, fig)

        # mark specific points
        filtered = mapped[mapped[0].isin(lst)]
        pyplot.scatter(filtered["x"].tolist(), filtered["y"].tolist(),
                       marker="p", edgecolors='black',
                       c=[shade(i) for i in filtered["color"].tolist()])
        pyplot.title('Perp = ' + str(perplexity) + ' Iter = ' + str(max_iter))
        pp.savefig()
    finally:
        # always release the PDF handle, even if plotting failed
        pp.close()
if maxima: print("Maxima! All the matrix to bhtsne...") Xmax_to_bhtsne = X_TFIDF.toarray() ndms = 2 tht = 0.5 rndsd = 42 vrbs = True prplxt_range = [30, 50] for prplxt in prplxt_range: print('bhtsne params: perplexity %d and n_dims %d' % (prplxt, ndms)) if reducto: X_embedded = np.asarray( list( bhtsne.run_bh_tsne(Xred_to_bhtsne, no_dims=ndms, perplexity=prplxt, theta=tht, randseed=rndsd, verbose=vrbs))) print("Plotting Reducto...") if kmers: file_fig = str( kmersize ) + "-kmersize_kmers_Reducto_Reg_True_length_" + str( prplxt) + "-per.png" else: file_fig = str( kmersize ) + "-kmersize_newtoken_Reducto_Reg_True_length_" + str( prplxt) + "-per.png" file_fig = os.path.join("./", file_fig) y_c = []
import numpy as np
import pylab
import os
import sys
from datetime import datetime
sys.path.append('./wrapper')
import bhtsne

# Demo: embed the MNIST-2500 sample with Barnes-Hut t-SNE and plot it.
print('Loading data...')
data = np.loadtxt("./data/mnist2500_X.txt")
labels = np.loadtxt("./data/mnist2500_labels.txt")
figfile = './results/bhtsne_demo.png'

# Create the output directory up front so a missing folder is not
# discovered only after the (slow) embedding has been computed.
figdir = os.path.dirname(figfile)
if figdir and not os.path.isdir(figdir):
    os.makedirs(figdir)

print('Processing...')
start = datetime.now()
# keep all input dimensions in the wrapper's initial reduction step
embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1])
end = datetime.now()
elapsedTime = (end - start).total_seconds()
print('Process time ', elapsedTime, 's')

# marker size 20, points coloured by their label
pylab.scatter(embedding_array[:, 0], embedding_array[:, 1], 20, labels)
pylab.savefig(figfile)
pylab.show()
print(pc_data.shape)

# DROPOUT
## sums = np.sum(pc_data**2, axis=1)
## print(min(sums))
## pc_data = pc_data[sums > .1, :]
## print(pc_data.shape)

# Y_file = 'bh_' + infile + '_out.txt'
# Y_samples = np.loadtxt(Y_file)  # DEBUG PRESENCE OF INITIAL SAMPLES
# weights = [1.0, 2.0, 3.0]

# Run one embedding per perplexity value and dump every returned array to
# its own text file, named through the ``file_root`` template.
for i, p in enumerate(perps):
    embedded, betas, orig_densities, emb_densities = bhtsne.run_bh_tsne(
        pc_data,
        initial_dims=pc_data.shape[1],
        theta=0.3,
        verbose=True,
        perplexity=p,
        max_iter=1000,
        use_pca=False)
    # Formatted explicitly so the output is the same under Python 2 and 3
    # (the former ``print a, b`` statement is a syntax error on Python 3).
    print("{} {}".format(embedded.shape, betas.shape))
    np.savetxt(file_root.format(infile, 'out', p), embedded)
    np.savetxt(file_root.format(infile, 'betas', p), betas)
    np.savetxt(file_root.format(infile, 'marg_origD', p), orig_densities)
    np.savetxt(file_root.format(infile, 'marg_embD', p), emb_densities)
print("Clusterring with t-SNE") '''Perform Barnes-hut t-SNE approximation on the encoded inputs first input is an NxD array, where N is the number of samples no_dims => Number of dimensions to reduce the data to. initial_dims => number of principle components to extract with PCA perplexity => 2^(shannon entropy). A fair dice with k sides has a perplexity of k. This is much faster and has a similar accuracy to the standard t-SNE PCA can be used to reduce the dimensionality before performing the clustering This speeds up the computation of pairwise distances and supresses some of the noise ''' #Call the python wrapper for the c++ implementation clusters, positions = run_bh_tsne(X, no_dims=2, perplexity=perplexity, verbose=True, initial_dims=50, use_pca=False, max_iter=max_iter) print("position shape", positions[0].shape) X_iter = np.dstack(position.reshape(-1, 2) for position in positions) f, ax, sc, txts = scatter(X_iter[..., -1], y) def make_frame_mpl(t): i = int(t * 40) x = X_iter[..., i] sc.set_offsets(x) for j, txt in zip(range(10), txts):
""" # query = """ # select vals.cartodb_id,geom.the_geom, # geom.the_geom_webmercator, # vals.total_pop, # vals.bachelors_degree, # vals.associates_degree, # black_pop, # white_pop, # asian_pop, # median_income # from obs_fcd4e4f5610f6764973ef8c0c215b2e80bec8963 as geom, # obs_b393b5b88c6adda634b2071a8005b03c551b609a as vals # where geom.geoid=vals.geoid # limit 100 # """ api_key = '893a45cc8505dfffe26d94b3c160a6fc1b1da459' user = '******' data = app.getCartoData(query, user, api_key) print(data) print(data.shape) result = bhtsne.run_bh_tsne(data.as_matrix(), initial_dims=data.shape[1], verbose=True, perplexity=25) # result = app.calcTSNE(data) data.assign(x=result[:, 0], y=result[:, 1]).to_csv('temp.csv', index=False)
# -- END

# Check whether the output file already exists; only an explicit "y"
# (any case, surrounding whitespace ignored) allows overwriting it.
user_input = 'n'
if os.path.isfile(out_file):
    print('Output file already exists.')
    user_input = input('Do you want to replace file (y/n): ').strip().lower()
else:
    user_input = 'y'

if user_input == 'y':
    # load input data; the context manager closes the file even on error
    with h5py.File(inputfile, 'r') as f:  # r - read only
        modedata = np.array(f['stackedmodes'])
    print('Size of data: ' + str(modedata.shape))

    # perform tsne algorithm
    # modedata=modedata.astype('float64')
    space = bhtsne.run_bh_tsne(modedata,
                               verbose=True,
                               perplexity=perplexity,
                               initial_dims=5 * 10,
                               max_iter=5000)

    # Save the result. The dataset is created from the shape only (as
    # before) so its on-disk dtype stays h5py's default.
    with h5py.File(out_file, 'w') as f:
        tsne_s = f.create_dataset('space', (space.shape))
        tsne_s[...] = space
(x_train, x_test), (y_train, y_test) = mnist.load_fashion_mnist_data( False, len_sample=70000, train_test_split=argp.freeze_index) # now, embed the train data: # load default initial gaussian embedding #_initial_embedding = get_initial_embedding(data_name=x_train, method_name="gaussian", i=1) if exact: """ pass an additional theta=0.0 if running exact tSNE """ # use initial embedding bh_tsne_dict = bhtsne.run_bh_tsne(x_train, verbose=True, initial_solution=None, theta=0.0) # save results # timestamp timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S") bhtsne.write_bh_tsne_result( bh_tsne_dict, os.path.join(RESULT_DIR, algorithm_dir, FREEZE_INITIAL_DIR), "-", timestamp) else: # use initial embedding bh_tsne_dict = bhtsne.run_bh_tsne(x_train, verbose=True, initial_solution=None)
initialDims = 21
perplexity = 30
theta = .5
alg = 'svd'


def load_data(data_set_name, data_set_type):
    """Load ``<type>-<name>.npy`` from the data set's folder under the fixed base path."""
    dataBasePath = '/home/dev/data/numer.ai/' + data_set_name + '/'
    return np.load(dataBasePath + data_set_type + '-' + data_set_name + '.npy')


data_set_name = '2016-09-08'
X = load_data(data_set_name, 'features')
Y = load_data(data_set_name, 'labels')
X = np.reshape(X, (-1, 21))  # one row per sample, 21 values each
Y = np.reshape(Y, (-1))      # flatten labels
print(np.shape(X))
print(np.shape(Y))

# no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False,
# initial_dims=50, use_pca=True, max_iter=1000,
# Only the first 10000 rows are embedded.
# NOTE(review): ``map`` shadows the builtin; the name is kept because code
# outside this view may rely on it.
map = run_bh_tsne(X[0:10000],
                  no_dims=numDims,
                  initial_dims=initialDims,
                  verbose=True,
                  perplexity=perplexity,
                  theta=theta,
                  usefile=False,
                  array=X[0:10000])

# gscatter(map(:,1), map(:,2), Y);
plt.scatter(map[:, 0], map[:, 1], 20, ('b', 'g'))
# Save BEFORE show(): once the window opened by show() is closed the figure
# is gone and savefig() would write an empty image.
plt.savefig('bhtsne.png')
plt.show()
sparse_data = sparse.csr_matrix((pc_data, pc_indices, pc_indptr), pc_shape) print sparse_data.shape tsvd = TruncatedSVD(n_components = 20) transformed = tsvd.fit_transform(sparse_data) else: data = np.log(1+pc_data_npz['X']) print pc_data_npz['genes'].shape, data.shape tsvd = TruncatedSVD(n_components = 20) transformed = tsvd.fit_transform(data) N,D = transformed.shape if sub: sub_sz = int(subsample*N) indices = np.random.choice(N, sub_sz, replace=False) transformed = transformed[indices,:] np.savetxt(pcafile + '.txt', transformed) embedded, betas = bhtsne.run_bh_tsne(transformed, initial_dims=transformed.shape[1], theta=0.3, verbose=True, perplexity=50, max_iter=1000, use_pca=False) print embedded.shape, betas.shape np.savetxt(outfile , embedded) np.savetxt(betafile, betas)
def cluster_fTSNE(dataset, low_filter=0.3, no_dims=2, perplexity=50, use_pca=True, initial_dims=50, max_iter=500, theta=0.5, randseed=-1, verbose=False):
    """Cluster patterns into two groups via FFT, t-SNE and k-means.

    Pipeline: clean the input (negative / NaN / inf -> 0), take the
    magnitude of the shifted 2-D FFT of every pattern, crop a central window
    whose half-size is ``low_filter / 2`` of each axis, normalise every
    cropped pattern with the radial profile of ``saxs.cal_saxs``, embed
    ``log(1 + |x|)`` of the flattened patterns with ``bhtsne.run_bh_tsne``
    and split the embedding with ``sklearn.cluster.k_means`` (k=2).

    Parameters
    ----------
    dataset : array indexed as (pattern, row, column), or the string
        ``"help"`` to print usage information and return ``None``.
        NOTE: the array is cleaned IN PLACE.
    low_filter : fraction of each axis kept around the frequency centre.
    no_dims, perplexity, use_pca, initial_dims, max_iter, theta, randseed,
    verbose : forwarded to ``bhtsne.run_bh_tsne`` (``no_dims``,
        ``initial_dims`` and ``max_iter`` are cast to int, ``theta`` is
        replaced by ``min(abs(theta), 1)``).

    Returns
    -------
    (embedding_array, label) : the t-SNE embedding and the k-means labels.

    Raises
    ------
    ValueError
        If ``low_filter`` is so small that the cropped window is empty.
    """
    import numpy as np
    import sys
    import os
    import gc

    if isinstance(dataset, str) and dataset == "help":
        help_lines = (
            "This function is used to do single-nonsingle patterns clustering using TSNE and kmeans",
            " -> Input: dataset (numpy.ndarray, shape=(Nd,Nx,Ny)",
            " option: low_filter (float 0~1, the percent of area at the frequency center that is used for clustering, default=0.3)",
            " option (TSNE): no_dims (+int, dimensions after decomposition, default=2)",
            " option (TSNE): perplexity (+int, perlexity value to evaluate P(i|j) in TSNE, default=50)",
            " option (TSNE): use_pca (bool, whether to use PCA to generate initiate features, default=True)",
            " option (TSNE): initial_dims (+int, output dimensions of inititate PCA, ignored if use_pca=False, default=50)",
            # the signature default is 500; the help text used to claim 1000
            " option (TSNE): max_iter (+int, max iterations, default=500, suggested >500)",
            " option (TSNE): theta (0~1 float, the speed vs accuracy trade-off parameter, theta=1 means highest speed, default=0.5)",
            " option (TSNE): randseed (int, >=0 use 'randseed' as initiate value's generating seed, <0 use current as random seed, default=-1)",
            " option (TSNE): verbose (default=False)",
            " -> Return: list, [data_after_decomposition, predicted_labels]",
            "[Notice] The input dataset is not recommended to contain more than 5k patterns, but it's also neccessary to have more than 500 ones. "
            "You can split the original dataset into several parts and use multi-processors to deal with them.",
            "Help End. Exit.",
        )
        for line in help_lines:
            print(line)
        return

    # project-local helper modules live next to this file
    sys.path.append(__file__.split("/image/classify.py")[0] + '/analyse')
    sys.path.append(os.path.join(os.path.dirname(__file__), 'bhtsne_source'))
    import saxs
    import radp

    no_dims = int(no_dims)
    initial_dims = int(initial_dims)
    max_iter = int(max_iter)
    theta = min(np.abs(theta), 1)
    # half-width of the cropped window along each image axis
    rcenter = [
        int(dataset.shape[1] * low_filter / 2.0),
        int(dataset.shape[2] * low_filter / 2.0)
    ]
    if rcenter[0] < 1 or rcenter[1] < 1:
        raise ValueError(
            "low_filter=%r leaves an empty window for patterns of shape %r"
            % (low_filter, tuple(dataset.shape[1:])))

    # fft
    print("\nStart FFT analysis ...")
    # in-place clean-up of the caller's array (order preserved)
    dataset[dataset < 0] = 0
    dataset[np.isnan(dataset)] = 0
    dataset[np.isinf(dataset)] = 0
    fdataset = np.zeros(dataset.shape)
    for ind, data in enumerate(dataset):
        fdataset[ind] = np.abs(np.fft.fftshift(np.fft.fft2(data)))
        sys.stdout.write("Processing " + str(ind) + "/" + str(len(dataset)) +
                         " ...\r")
        sys.stdout.flush()
    print("\nDone.")

    # normalization
    print("\nStart normalization ...")
    # Integer division: these values are slice bounds, and "/" yields floats
    # on Python 3, which are not valid indices.
    row_c = fdataset.shape[1] // 2
    col_c = fdataset.shape[2] // 2
    fdataset = fdataset[:,
                        row_c - rcenter[0]:row_c + rcenter[0],
                        col_c - rcenter[1]:col_c + rcenter[1]]
    # centre of the cropped window (float on purpose, as before)
    center_data = (fdataset.shape[1] / 2.0, fdataset.shape[2] / 2.0)
    saxs_data = saxs.cal_saxs(fdataset)
    saxs_intens = radp.radial_profile_2d(saxs_data, center_data)
    dataset_norm = np.zeros(fdataset.shape)
    for ind, pat in enumerate(fdataset):
        dataset_norm[ind] = radp.radp_norm_2d(saxs_intens, pat, center_data)
        sys.stdout.write("Processing " + str(ind) + "/" + str(len(fdataset)) +
                         " ...\r")
        sys.stdout.flush()
    print("\nDone.")

    # decomposition
    print("\nStart decomposition using TSNE ...")
    # flatten every pattern into one feature vector
    dataset_norm = dataset_norm.reshape(dataset_norm.shape[0], -1)
    log_data_norm = np.log(1 + np.abs(dataset_norm))
    # drop the large intermediates before the memory-hungry embedding
    del dataset_norm
    del fdataset
    del saxs_data
    gc.collect()
    import bhtsne
    embedding_array = bhtsne.run_bh_tsne(log_data_norm,
                                         no_dims=no_dims,
                                         perplexity=perplexity,
                                         use_pca=use_pca,
                                         initial_dims=initial_dims,
                                         max_iter=max_iter,
                                         theta=theta,
                                         randseed=randseed,
                                         verbose=verbose)

    # clustering
    print("\nStart clustering ...")
    from sklearn import cluster
    centroid, label, inertia = cluster.k_means(embedding_array, 2)
    return embedding_array, label
# --- standard library -------------------------------------------------------
import io
import os, sys
from argparse import ArgumentParser, FileType
from os import devnull
from os.path import abspath, dirname, isfile, join as path_join
from platform import system
from shutil import rmtree
from struct import calcsize, pack, unpack
from subprocess import Popen
from sys import stderr, stdin, stdout
from tempfile import mkdtemp

# --- third party / local ----------------------------------------------------
import numpy as np
import bhtsne

# Location of the MNIST-2500 feature table on the author's machine.
mnist_path = '/Users/jiadao/PycharmProjects/Py3/data visulization/tsne/bhtsne-master/mnist2500_X.txt'

# The first row of the file is skipped when loading.
data = np.loadtxt(mnist_path, skiprows=1)
print('FINISHED LOADING')

# Embed with the wrapper's default settings.
embedding_array = bhtsne.run_bh_tsne(data)
# Collect the model parameters of every input file. The per-file blocks are
# gathered in a list and stacked once (repeated vstack re-copies everything
# on each iteration).
theta_blocks = []
for i, h5name in enumerate(h5filelist):
    # context manager: the handle is closed even if reading fails
    with h5py.File(inputfolder + '/' + h5name, 'r') as f:  # r - read only
        loadedthetas = np.array(f['thetas'])
    # matrix with models x 16 entries
    # reshape attaches [0,0,:],[0,1,:],...
    usethetas = np.reshape(loadedthetas[:, 1:5, :],
                           (loadedthetas.shape[0], 16))
    entrynumber[i] = usethetas.shape[0]  # number of linear models in file
    theta_blocks.append(usethetas)

if not theta_blocks:
    raise ValueError('h5filelist is empty: nothing to embed')
# stitches all together
allthetas = np.vstack(theta_blocks)
print('Size of data: ' + str(allthetas.shape))

# perform tsne algorithm
allthetas = allthetas.astype('float64')
pca_dim = 14
space = bhtsne.run_bh_tsne(allthetas,
                           verbose=True,
                           perplexity=perplexity,
                           initial_dims=pca_dim,
                           max_iter=5000)

# Save the result. Datasets are created from their shape and then filled,
# exactly as before, so the stored dtypes do not change.
with h5py.File(out_file, 'w') as f:
    entrynumbers = f.create_dataset('entrynumber', (entrynumber.shape))
    entrynumbers[...] = entrynumber
    alltheta = f.create_dataset('allthetas', (allthetas.shape))
    alltheta[...] = allthetas
    # resizable so the dataset can take whatever shape the embedding has
    tsne_s = f.create_dataset('space', (len(allthetas), 2),
                              maxshape=(None, None))
    tsne_s.resize(space.shape)
    tsne_s[...] = space
trainer.close()

# ---- Draw the picture ----
# Scale every row of both matrices to unit L2 norm (in place).
weights /= np.sqrt((weights ** 2).sum(axis=1, keepdims=True))
embeddings_val /= np.sqrt((embeddings_val ** 2).sum(axis=1, keepdims=True))

# Keep one weight row per label that actually occurs among the first
# ``num_samples`` labels, in order of first appearance.
index2center = OrderedDict()
for i in range(num_samples):
    label = labels_val[i]
    if label in index2center:
        continue
    index2center[label] = weights[label, :]

weights_index = list(index2center.keys())
weights_new = np.stack(list(index2center.values()), axis=0)
num_weights = len(weights_index)

# t-SNE on the selected weight rows followed by all embeddings, so both
# share one 2-D space; the result is split back by row position.
combined = np.concatenate([weights_new, embeddings_val], axis=0)
Y = run_bh_tsne(combined, no_dims=2, initial_dims=50)
Y_weights, Y_embeddings = Y[:num_weights, :], Y[num_weights:, :]

plt.figure(1)
# embeddings coloured by label, weight rows drawn as crosses
plt.scatter(Y_embeddings[:, 0], Y_embeddings[:, 1], c=labels_val)
plt.scatter(Y_weights[:, 0], Y_weights[:, 1], marker="x")
plt.savefig(args.embedding_pic)
def tsne_workflow(parameter_name,
                  value_list,
                  data,
                  result_base_dir,
                  data_result_subdirectory,
                  initial_embedding_method=None,
                  **kwargs):
    """Run t-SNE once per tuning value (and per round) and store each result.

    :param parameter_name: keyword name of the ``run_bh_tsne`` argument being tuned
    :param value_list: values to try for that argument
    :param data: input passed to ``bhtsne.run_bh_tsne``
    :param result_base_dir: root directory for the results
    :param data_result_subdirectory: data set name; used as a path component
        and when looking up the initial embedding
    :param initial_embedding_method: optional method name; ``'gaussian'`` and
        ``'random'`` trigger rounds 1..5, anything else a single round
    :param kwargs: further keyword arguments forwarded to ``run_bh_tsne``
    :return: None; results are written to
        ``result_base_dir/<value>/<data_result_subdirectory>/<round>``
    """
    randomised = initial_embedding_method in ['gaussian', 'random']
    # 5 rounds for random initialisations, 1 otherwise (range end is exclusive)
    stop_round = 6 if randomised else 2

    for value in value_list:
        print("###########################################")
        print("## Start t-SNE ##")
        print("###########################################")
        print("Using Dataset: {}".format(data_result_subdirectory))
        print("Tuning parameter: " + parameter_name + ", value: " + str(value))

        for i in range(1, stop_round):
            print("###", "### Round:" + str(i), "###")

            # make sure the per-round result directory exists
            result_dir = os.path.join(result_base_dir, str(value),
                                      data_result_subdirectory, str(i))
            try:
                os.makedirs(result_dir)
            except FileExistsError:
                pass  # already there

            # optional initial embedding
            _initial_embedding = None
            if initial_embedding_method is not None:
                _initial_embedding = get_initial_embedding(
                    data_name=data_result_subdirectory,
                    method_name=initial_embedding_method,
                    i=i)
                round_suffix = "_" + str(i) if randomised else ""
                template = ("initial_solution_" + data_result_subdirectory +
                            "_" + initial_embedding_method + "{}" + ".pickle")
                filename = template.format(round_suffix)
                print("Using initial embedding file: {}".format(filename))

            # run t-SNE with the tuned parameter injected by name
            tuned = {parameter_name: value}
            bh_tsne_dict = bhtsne.run_bh_tsne(
                data,
                verbose=True,
                initial_solution=_initial_embedding,
                **tuned,
                **kwargs)

            # save results, tagged with the current timestamp
            timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
            bhtsne.write_bh_tsne_result(bh_tsne_dict, result_dir, "-",
                                        timestamp)
import numpy as np
import bhtsne

# Regression check: fixed seeds for both the input data and the embedding.
np.random.seed(1137)
X = np.random.rand(500, 10)
n_samples = X.shape[0]

proj, betas, cpp, cpi = bhtsne.run_bh_tsne(X,
                                           verbose=False,
                                           randseed=1137,
                                           return_betas=True,
                                           return_cost_per_point=True,
                                           return_cost_per_iter=True)

# one beta and one cost value per input sample
assert betas.shape[0] == n_samples, betas.shape
assert cpp.shape[0] == n_samples, cpp.shape
# one cost entry per iteration (no max_iter is passed, so 1000 entries are expected)
assert cpi.shape[0] == 1000, cpi.shape
# Compare with a tight tolerance rather than bit-exact equality, which
# breaks on harmless last-digit differences between builds or platforms.
assert np.isclose(proj[0, 0], -7.235696435669544, rtol=1e-9, atol=0.0), proj[0, 0]
import numpy as np
import bhtsne
from sklearn.decomposition import PCA

# Disabled alternative input (pollen data: log1p, centre each row,
# scale each row to unit norm, keep 50 principal components):
# data = np.loadtxt('../example_data/pollen.txt',delimiter=',').T
# data = np.log(1+data)
# data = data - np.mean(data, axis=1, keepdims=True)
# data = data/(np.sum(data**2, axis=1, keepdims=True))**.5
# pca = PCA(n_components=50)
# pc_data = pca.fit_transform(data)

# The text file is transposed after loading.
pc_data = np.loadtxt('gaussian_density_overlap.txt').T
print(pc_data.shape)

# theta=0. together with initial_dims equal to the number of columns
tsne_options = dict(initial_dims=pc_data.shape[1],
                    theta=0.,
                    verbose=True,
                    perplexity=30)
embedded = bhtsne.run_bh_tsne(pc_data, **tsne_options)
np.savetxt('orig_overlap_out.txt', embedded)
def multi_run_wrapper(args):
    """Call ``bhtsne.run_bh_tsne`` with ``args`` unpacked as positional arguments.

    The call must yield exactly four values, which are returned as a tuple:
    (projections, betas, cost per point, cost per iteration).
    """
    projections, betas, cost_per_point, cost_per_iter = bhtsne.run_bh_tsne(*args)
    return (projections, betas, cost_per_point, cost_per_iter)
# Finish and save the elbow plot started above.
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k for tsne_data')
plt.savefig("figures/elbowmethodt.png")
#plt.show()

## Cluster dimension-reduced data with KMeans, with n_cluster equal to k
K = 2
km = KMeans(n_clusters=K, init='k-means++', n_init=100)
# fit_predict() fits the model itself; the separate fit() call that used to
# precede it only repeated the 100 initialisations and was then discarded.
x = km.fit_predict(train)

embedding_array = bhtsne.run_bh_tsne(train,
                                     no_dims=2,
                                     perplexity=4,
                                     initial_dims=train.shape[1],
                                     verbose=True)
tsne_data = pd.DataFrame(embedding_array)

# plot tsne_dat
tsne_data["cluster"] = x
tsne_data = tsne_data.sort_values('cluster')
print("tsne-data")
print(tsne_data)

# One colour per cluster id; ids without an entry are skipped, as before.
cluster_to_color = {0: 'red', 1: 'green'}
color_list = [cluster_to_color[cluster]
              for cluster in tsne_data['cluster']
              if cluster in cluster_to_color]