def generate_vectors(json_input_filename, w2v_dim, perplexity, theta, pca_dims, dim=2):
    """Build one vector per image and embed all of them with Barnes-Hut t-SNE.

    For every image in the label map, the per-label ``word2vec`` vectors are
    merged by ``word2vec.linear_combination_vectors`` using the label scores
    as coefficients.  The merged vectors are passed through ``bh_tsne`` and
    the embedding is rescaled by ``utils.scale_max_abs``.

    :param json_input_filename: first argument given to ``utils.load_json``
    :param w2v_dim: second argument given to ``utils.load_json``
    :param perplexity: forwarded to ``bh_tsne`` as ``perplexity``
    :param theta: forwarded to ``bh_tsne`` as ``theta``
    :param pca_dims: forwarded to ``bh_tsne`` as ``initial_dims``
    :param dim: forwarded to ``bh_tsne`` as ``no_dims``
    :return: tuple ``(embeddings, most_dominant_labels, image_ids)``; the
        three sequences are index-aligned
    :raises IndexError: if an image has an empty label list
    """
    vectors = []
    most_dominant_labels = []
    image_ids = []

    label_map = utils.load_json(json_input_filename, w2v_dim)
    # ``items()`` exists on both Python 2 and Python 3 mappings, whereas the
    # former ``iteritems()`` is Python-2-only.
    for image_id, label in label_map.items():
        label_vectors = [val['word2vec'] for val in label]
        label_scores = [val['score'] for val in label]
        # Strip non-printable characters from every description.
        label_desc = [
            str(''.join(c for c in val['description'] if c in string.printable))
            for val in label
        ]

        output_vec = word2vec.linear_combination_vectors(
            vectors=label_vectors, coefficients=label_scores)
        vectors.append(output_vec)
        # The first entry is taken as the dominant label -- assumes the
        # labels arrive ordered by relevance; TODO confirm against the JSON.
        most_dominant_labels.append(label_desc[0])
        image_ids.append(image_id)

    embeddings = list(bh_tsne(vectors, perplexity=perplexity,
                              initial_dims=pca_dims, theta=theta,
                              no_dims=dim))
    embeddings = utils.scale_max_abs(embeddings)

    return embeddings, most_dominant_labels, image_ids
def main():
    """Embed the README/description corpus in 2-D with t-SNE and show a scatter plot."""
    # Corpus: list of READMEs and descriptions.
    raw_docs = get_text()
    # Stemming of every document.
    stemmed_docs = map(preprocess, raw_docs)
    # Convert to a tf-idf matrix, then reduce dimensions.
    tfidf = tfidf_vectorizer(stemmed_docs)
    reduced = svd_vectorizer(tfidf, n_components=200, n_iter=150)

    # Run t-distributed Stochastic Neighbor Embedding (t-SNE; Barnes-Hut implementation)
    # Timings: sklearn - 1k: 15.9195120335, 2k - 41.7645118237, 4k - 185.737361908
    #          t-sne   - 1k: 15.7083182335, 2k - 38.8270409107, 4k - 78.0439789295
    points = np.array(
        list(bh_tsne(reduced, no_dims=2, perplexity=40, verbose=True)))
    # ``points`` can be used as input to identify clusters of projects.

    # Plot t-SNE
    figure, axes = plt.subplots(figsize=(10, 10))
    plt.setp(axes, xticks=(), yticks=())
    figure.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                           wspace=0.0, hspace=0.0)
    axes.scatter(points[:, 0], points[:, 1], marker='x')
    plt.show()
def run_bhtsne(data_set, theta=0.5, perplexity=50):
    """
    Runs the bh-tsne on the given data and min-max normalizes the result

    :type data_set: numpy array
    :param data_set: Numpy array on which bh-tsne shall be run

    :type theta: float
    :param theta: Specifies the theta parameter

    :type perplexity: int
    :param perplexity: Specifies the perplexity

    :rtype: numpy array
    :return: Array of shape (n, 2) holding the embedding, shifted and scaled
             column-wise so that every column starts at 0 and, unless the
             column is constant, ends at 1
    """
    n = data_set.shape[0]
    print('Running Barnes-Hut - t-SNE on %d data points...' % n)

    data_bhtsne = np.zeros((n, 2))
    # bh_tsne receives a copy, so the caller's array is never handed to it
    # directly.  Each yielded point is written into the matching row.
    embedded_points = bh_tsne(np.copy(data_set), theta=theta,
                              perplexity=perplexity)
    for dat, temp in zip(embedded_points, data_bhtsne):
        temp[...] = dat

    print('\nNormalizing...')
    # Local names deliberately avoid shadowing the ``min``/``max`` builtins.
    col_min = np.min(data_bhtsne, axis=0)
    data_bhtsne = data_bhtsne - col_min
    col_range = np.max(data_bhtsne, axis=0)
    # A constant column has a zero range; dividing by it would yield NaN
    # (0 / 0).  Dividing by 1 instead leaves such a column at 0.
    col_range[col_range == 0] = 1.0
    data_bhtsne = data_bhtsne / col_range

    return data_bhtsne
def LDA_run():
    """Train an LDA model on the text folder, embed the articles and write them out.

    Steps: load the articles from ``text_dir``, build a tf-idf weighted
    bag-of-words corpus, train LDA, convert the documents to vectors with
    ``LDA2Vec``, run t-SNE on those vectors, cluster them (k-means unless the
    module-level ``classes`` is provided) and pass keys, coordinates and
    clusters to ``output_write``.

    Side effects: stores the document vectors in the global ``g_vec``.

    :return: the trained LDA model
    """
    global g_lda, g_vec, g_coords, g_clust

    articles = load_data_folder(text_dir)
    # One key list, reused everywhere, keeps documents, coordinates and
    # clusters index-aligned.
    lda_keys = list(articles.keys())
    corpus = [bag_of_wordify(articles[key]) for key in lda_keys]

    wdict = corpora.Dictionary(corpus)
    bow_corpus = [wdict.doc2bow(text) for text in corpus]
    tfidf = models.tfidfmodel.TfidfModel(bow_corpus, normalize=True)
    tfidf_corpus = [tfidf[doc] for doc in bow_corpus]

    NUM_TOPICS = 100
    printl("Training LDA model")
    lda_model = LDA_train(wdict, articles, tfidf_corpus, NUM_TOPICS)

    print("Converting to vector representation")
    wordvec = LDA2Vec(lda_model, tfidf_corpus)
    g_vec = wordvec

    printl("running tsne")
    coords = list(bhtsne.bh_tsne(wordvec))

    print("running kmeans")
    # Identity test: ``classes == None`` is element-wise for array-likes and
    # would make the ``if`` ambiguous.
    if classes is None:
        clusters = kmeans_clusters(lda_keys, wordvec)
    else:
        # Avoid kmeans if classes are already provided
        clusters = classes

    output_write(lda_keys, coords, clusters)
    return lda_model
def analyze(path, h5_file):
    """Plot the t-SNE embedding of one HDF5 file's states and save it as a PNG.

    The category name is the file's base name with ``.h5`` removed.  The
    ``state`` column of the ``'data'`` table is stacked with ``np.vstack``,
    embedded by ``bh_tsne`` and scattered; the image is written to
    ``<path>/<category>.png``.

    Failures while embedding or plotting are printed and swallowed so that
    one bad file does not abort a batch run.

    :param path: directory the PNG is written to
    :param h5_file: path of the HDF5 file to process
    """
    category = os.path.basename(h5_file.replace('.h5', ''))
    print('Processing category {}'.format(category))
    data = pd.read_hdf(h5_file, 'data')

    # Draw on a dedicated figure.  Without it every call in the same process
    # scatters onto pyplot's shared current figure, so points of previously
    # processed categories leak into later images.
    fig = plt.figure()
    try:
        tsne = np.array(list(bh_tsne(np.vstack(data.state))))
        plt.scatter(tsne[:, 0], tsne[:, 1])
        plt.title(category)
        plt.savefig('{}/{}.png'.format(path, category), dpi=300)
    except Exception as e:
        # Deliberate best effort: report and carry on with the next file.
        print(e)
    finally:
        # Release the figure so long batch runs do not accumulate them.
        plt.close(fig)
def word2vec_run():
    """Train Word2Vec on the text folder, embed the articles with t-SNE and write them out.

    The articles are vectorised with ``word2vectorize``, embedded by
    ``bhtsne.bh_tsne``, clustered by ``kmeans_clusters`` and handed to
    ``output_write`` together with their keys.

    :return: list of t-SNE coordinates, one entry per article vector
    """
    raw_art = load_data_folder(text_dir)
    sentences = list(gen_load(text_dir))

    # All this should be configurable
    w2v = models.Word2Vec(sentences, workers=4, window=5, min_count=3,
                          size=WORDVEC_SIZE)

    wordvec = word2vectorize(w2v, raw_art)
    coords = list(bhtsne.bh_tsne(wordvec))
    clusters = kmeans_clusters(list(raw_art.keys()), wordvec)
    output_write(list(raw_art.keys()), coords, clusters)
    return coords
def plot_matrix(A, title=None, labels=None, vocab=None, fig=None):
    """Scatter the t-SNE embedding of the rows of ``A``, once per column of ``A``.

    Every column gets a colour (cycling through ``cmap``) and a marker (the
    marker advances each time the colours wrap around).  For a given column
    the points' alpha is the row's value divided by the column's maximum, and
    the row holding that maximum is drawn again, opaque, carrying the legend
    entry.  An ``AnnoteFinder`` built from ``vocab`` is connected to mouse
    clicks.

    :param A: 2-D array; rows are embedded, columns are plotted one by one
    :param title: optional figure title
    :param labels: currently unused
    :param vocab: per-row annotations; defaults to the row indices when
        ``None`` or empty
    :param fig: figure (or figure number) to draw on; a new figure is created
        when ``None``
    :return: array with the embedded coordinates of the rows of ``A``
    """
    cmap = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [1, 1, 0],
        [0, 1, 1],
    ]
    cl = len(cmap)
    markers = ["o", "d", ">", (5, 1)]
    ml = len(markers)

    # Explicit emptiness test: plain truthiness raises for NumPy arrays.
    if vocab is None or len(vocab) == 0:
        vocab = range(A.shape[0])

    res = np.array(list(bh_tsne(A, verbose=True)))

    # ``is None`` rather than truthiness so that figure number 0 is honoured.
    if fig is None:
        plt.figure()
    else:
        plt.figure(fig)
    if title:
        plt.title(title)

    # ``range`` works on Python 2 and 3; ``xrange`` exists on Python 2 only.
    for col in range(A.shape[1]):
        top_word = np.argmax(A[:, col])
        mk = (col // cl) % ml
        rgb = cmap[col % cl]

        colors = np.zeros((A.shape[0], 4))
        colors[:, :3] = rgb
        # Alpha channel: weight relative to the column's largest entry.
        colors[:, -1] = A[:, col] / A[top_word, col]

        plt.scatter(res[:, 0], res[:, 1], c=colors, marker=markers[mk], s=30, edgecolor="none")
        plt.scatter(
            res[top_word, 0],
            res[top_word, 1],
            c=rgb,
            marker=markers[mk],
            s=30,
            edgecolor="none",
            label=u"тема #" + str(col),
        )

    # ``vocab`` is always set at this point, so the annotation helper is
    # attached unconditionally (the former ``all(vocab) != None`` guard was
    # always true).
    af = AnnoteFinder(res[:, 0], res[:, 1], vocab, xtol=0.1, ytol=0.1)
    plt.connect("button_press_event", af)

    plt.legend(scatterpoints=1, loc="best", ncol=3, fontsize=9)
    plt.draw()
    return res
def train(dataset='mnist.pkl.gz'):
    """Run Barnes-Hut t-SNE on a dataset, pickle the raw embedding and return it normalised.

    The raw 2-D embedding is dumped to ``data.pkl`` next to this source file.
    The elapsed embedding time is reported on ``stderr``.

    :param dataset: argument handed to ``load_data``
    :return: array of shape ``(n_samples, 2)`` with the embedding shifted and
        scaled column-wise (minimum subtracted, then divided by the maximum)
    """
    dataset = load_data(dataset)
    # First element of the first split -- presumably the training inputs;
    # TODO confirm against ``load_data``.
    data = dataset[0][0].astype('float64')

    start_time = timeit.default_timer()

    results = np.zeros((data.shape[0], 2))
    print('... training barnes-Hut tsne')
    # Each yielded point is copied into the matching pre-allocated row.
    for res, save in zip(bh_tsne(np.copy(data), theta=0.5), results):
        save[...] = res

    end_time = timeit.default_timer()
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fs' % (end_time - start_time)), file=sys.stderr)

    # The pickle holds the embedding before normalisation.
    with open(os.path.join(os.path.split(__file__)[0], 'data.pkl'), 'wb') as f:
        pickle.dump(results, f)

    results = results - np.min(results, axis=0)
    results = results / np.max(results, axis=0)
    # Hand the normalised embedding back; it used to be computed and dropped.
    return results
def plot_matrix(A, title=None, labels=None, vocab=None, fig=None):
    """Scatter the t-SNE embedding of the rows of ``A``, once per column of ``A``.

    Every column gets a colour (cycling through ``cmap``) and a marker (the
    marker advances each time the colours wrap around).  For a given column
    the points' alpha is the row's value divided by the column's maximum, and
    the row holding that maximum is drawn again, opaque, carrying the legend
    entry.  An ``AnnoteFinder`` built from ``vocab`` is connected to mouse
    clicks.

    :param A: 2-D array; rows are embedded, columns are plotted one by one
    :param title: optional figure title
    :param labels: currently unused
    :param vocab: per-row annotations; defaults to the row indices when
        ``None`` or empty
    :param fig: figure (or figure number) to draw on; a new figure is created
        when ``None``
    :return: array with the embedded coordinates of the rows of ``A``
    """
    cmap = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [0, 1, 1]]
    cl = len(cmap)
    markers = ['o', 'd', '>', (5, 1)]
    ml = len(markers)

    # Explicit emptiness test: plain truthiness raises for NumPy arrays.
    if vocab is None or len(vocab) == 0:
        vocab = range(A.shape[0])

    res = np.array(list(bh_tsne(A, verbose=True)))

    # ``is None`` rather than truthiness so that figure number 0 is honoured.
    if fig is None:
        plt.figure()
    else:
        plt.figure(fig)
    if title:
        plt.title(title)

    # ``range`` works on Python 2 and 3; ``xrange`` exists on Python 2 only.
    for col in range(A.shape[1]):
        top_word = np.argmax(A[:, col])
        mk = (col // cl) % ml
        rgb = cmap[col % cl]

        colors = np.zeros((A.shape[0], 4))
        colors[:, :3] = rgb
        # Alpha channel: weight relative to the column's largest entry.
        colors[:, -1] = A[:, col] / A[top_word, col]

        plt.scatter(res[:, 0], res[:, 1], c=colors, marker=markers[mk], s=30, edgecolor='none')
        plt.scatter(res[top_word, 0], res[top_word, 1], c=rgb, marker=markers[mk], s=30,
                    edgecolor='none', label=u'тема #' + str(col))

    # ``vocab`` is always set at this point, so the annotation helper is
    # attached unconditionally (the former ``all(vocab) != None`` guard was
    # always true).
    af = AnnoteFinder(res[:, 0], res[:, 1], vocab, xtol=0.1, ytol=0.1)
    plt.connect('button_press_event', af)

    plt.legend(scatterpoints=1, loc='best', ncol=3, fontsize=9)
    plt.draw()
    return res
def generate_vectors(json_input_filename, w2v_dim, perplexity, theta, pca_dims, dim=2):
    """Build one vector per image and embed all of them with Barnes-Hut t-SNE.

    For every image in the label map, the per-label ``word2vec`` vectors are
    merged by ``word2vec.linear_combination_vectors`` using the label scores
    as coefficients.  The merged vectors are passed through ``bh_tsne`` and
    the embedding is rescaled by ``utils.scale_max_abs``.

    :param json_input_filename: first argument given to ``utils.load_json``
    :param w2v_dim: second argument given to ``utils.load_json``
    :param perplexity: forwarded to ``bh_tsne`` as ``perplexity``
    :param theta: forwarded to ``bh_tsne`` as ``theta``
    :param pca_dims: forwarded to ``bh_tsne`` as ``initial_dims``
    :param dim: forwarded to ``bh_tsne`` as ``no_dims``
    :return: tuple ``(embeddings, most_dominant_labels, image_ids)``; the
        three sequences are index-aligned
    :raises IndexError: if an image has an empty label list
    """
    vectors = []
    most_dominant_labels = []
    image_ids = []

    label_map = utils.load_json(json_input_filename, w2v_dim)
    # ``items()`` exists on both Python 2 and Python 3 mappings, whereas the
    # former ``iteritems()`` is Python-2-only.
    for image_id, label in label_map.items():
        label_vectors = [val['word2vec'] for val in label]
        label_scores = [val['score'] for val in label]
        # Strip non-printable characters from every description.
        label_desc = [
            str(''.join(c for c in val['description'] if c in string.printable))
            for val in label
        ]

        output_vec = word2vec.linear_combination_vectors(
            vectors=label_vectors, coefficients=label_scores)
        vectors.append(output_vec)
        # The first entry is taken as the dominant label -- assumes the
        # labels arrive ordered by relevance; TODO confirm against the JSON.
        most_dominant_labels.append(label_desc[0])
        image_ids.append(image_id)

    embeddings = list(bh_tsne(vectors, perplexity=perplexity,
                              initial_dims=pca_dims, theta=theta,
                              no_dims=dim))
    embeddings = utils.scale_max_abs(embeddings)

    return embeddings, most_dominant_labels, image_ids
def reduce_tsne(D, to_dim=2):
    """Reduce ``D`` to ``to_dim`` dimensions with Barnes-Hut t-SNE.

    :param D: data handed to ``bh_tsne``
    :param to_dim: number of output dimensions, forwarded as ``no_dims``
    :return: array built from the points yielded by ``bh_tsne``
    """
    print('Reducing with t-SNE')
    # ``to_dim`` used to be accepted but never forwarded, so the embedding
    # always had bh_tsne's default dimensionality.
    return array(list(bh_tsne(D, no_dims=to_dim, verbose=True)))
def tsne(biembedsfn):
    """Load the data behind ``biembedsfn`` and embed it in 2-D with t-SNE.

    :param biembedsfn: argument handed to ``get_data``
    :return: list of the points produced by ``bhtsne.bh_tsne``
    """
    data = get_data(biembedsfn)
    # The result must be iterated: the call alone was previously discarded,
    # and if ``bh_tsne`` is a generator (it is consumed by iteration
    # everywhere else) nothing would even have run.
    return list(bhtsne.bh_tsne(data, no_dims=2))
# Split the rows into two parallel lists (second field -> words, first
# field -> ids), then drop the rows and collect garbage to free memory
# before the embedding starts.
ids = [row[0] for row in lines]
words = [row[1] for row in lines]
del lines
import gc
gc.collect()

##############################################################
# tsne
# The bhtsne wrapper lives outside the project; make it importable first.
import sys
sys.path.append('/home/ycao/third_party_src/bhtsne')
from bhtsne import bh_tsne

num_dims = 2
pca_dims = 50
perplexity = 50
theta = .5
tsne_out = list(
    bh_tsne(X, num_dims, pca_dims, perplexity, theta, verbose=True))

# Persist the embedding together with the words and ids.
payload = {
    'tsne': tsne_out,
    'words': words,
    'ids': ids
}
joblib.dump(payload, 'proc_dir/tsne_out.pickle', compress=9)

# Drop into the debugger for interactive inspection of the results.
import pdb
pdb.set_trace()
import bhtsne as bh
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Embed the iris measurements with Barnes-Hut t-SNE.
iris = datasets.load_iris()
res = bh.bh_tsne(samples=iris['data'], perplexity=5, theta=0.15,
                 verbose=True)

# Only the first item produced by bh_tsne is used; it is iterated as pairs
# and the second member of every pair is kept.
first_item = list(res)[0]
points = np.asarray([second for (first, second) in first_item])

# Min-max normalise every column in place.
points -= points.min(axis=0)
points /= points.max(axis=0)

plt.scatter(points[:, 0], points[:, 1])

# Annotate every point with its iris class.
for label, x, y in zip(iris['target'], points[:, 0], points[:, 1]):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(-10, 10),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox={'boxstyle': 'round,pad=0.15', 'fc': 'yellow', 'alpha': 0.3},
        arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
    )

plt.show()
plt.title(category) plt.savefig('{}/{}.png'.format(path, category), dpi=300) except Exception as e: print(e) files = glob.glob('{}/*.h5'.format(args.data_folder)) if args.all: data = [] labels = [] for i, h5_file in enumerate(files): _data = pd.read_hdf(h5_file, 'data').state[:args.n] data.extend(_data) labels.extend([i]*len(_data)) if args.components: pca = PCA(n_components=args.components) X = pca.fit_transform(np.vstack(data)) else: X = np.vstack(data) tsne = np.array([ y for y in bh_tsne(X) ]) #fix to say false to PCA to bh_tsne plt.scatter(tsne[:,0], tsne[:,1], c=labels) plt.savefig('{}/global_{}_components.png'.format(args.png_folder, args.components), dpi=300) else: par_analyze = partial(analyze, args.png_folder) pool = mp.Pool() pool.map(par_analyze, files)
import bhtsne as bh
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Embed the iris measurements with Barnes-Hut t-SNE.
iris = datasets.load_iris()
res = bh.bh_tsne(samples=iris['data'], perplexity=5, theta=0.15,
                 verbose=True)

# Only the first item produced by bh_tsne is used; it is iterated as pairs
# and the second member of every pair is kept.
first_item = list(res)[0]
points = np.asarray([second for (first, second) in first_item])

# Min-max normalise every column in place.
points -= points.min(axis=0)
points /= points.max(axis=0)

plt.scatter(points[:, 0], points[:, 1])

# Annotate every point with its iris class.
for label, x, y in zip(iris['target'], points[:, 0], points[:, 1]):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(-10, 10),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox={'boxstyle': 'round,pad=0.15', 'fc': 'yellow', 'alpha': 0.3},
        arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
    )

plt.show()