def coords_func(lib, opts, args):
    if opts.type:
        # Load the pickled per-track sequence features
        seq_features_file = config["blackbird"]["seq_features"].get("unicode")
        seq_features = cPickle.load(open(seq_features_file, "rb"))
        keys = seq_features.keys()

        if opts.type == "mean":
            # Mean-pool each sequence, trimming the first and last 10% of frames
            features = np.empty((len(seq_features), 20))
            for idx, key in enumerate(seq_features):
                length = seq_features[key].shape[1]
                features[idx, :] = seq_features[key][
                    :, int(0.1 * length):int(0.9 * length)].mean(axis=1)
        elif opts.type == "lstm":
            print("Loading network...")
            model = LSTMSeq2Seq(config["blackbird"]["lstm"]["arch"].get("unicode"),
                                config["blackbird"]["lstm"]["weights"].get("unicode"),
                                config["blackbird"]["lstm"]["output"].get())

            # Pad sequences to a fixed length before feeding the LSTM
            maxlen = 150
            padded_seq_features = np.empty((len(seq_features), maxlen, 20))
            for idx, key in enumerate(seq_features):
                padded_seq_features[idx, :, :] = sequence.pad_sequences(
                    seq_features[key], maxlen=maxlen, dtype="float32").T

            print("Getting vectors...")
            features = model.predict(padded_seq_features)
        else:
            print("Provide a valid --type [mean, lstm]")
            sys.exit(1)

        print("Reducing dimensions...")
        features_2d = TSNE(n_components=2).fit_transform(features)

        print("Writing to db...")
        conn = sqlite3.connect(config["blackbird"]["db"].get("unicode"))
        cur = conn.cursor()
        cur.execute("DELETE FROM coords")

        to_insert = []
        for idx in xrange(features_2d.shape[0]):
            to_insert.append((keys[idx], features_2d[idx, 0], features_2d[idx, 1]))

        cur.executemany("INSERT INTO coords VALUES (?, ?, ?)", to_insert)
        conn.commit()
        conn.close()

        # Fill leftovers: library items that have no precomputed features
        ids_to_fill = []
        for item in lib.items():
            if item.id not in keys:
                ids_to_fill.append(item.id)

        self.fill(ids_to_fill, features_2d.min(axis=0), features_2d.max(axis=0))
    else:
        print("Provide a valid --type [mean, lstm]")
"""## t-SNE""" import matplotlib.pyplot as plt from matplotlib import offsetbox """Classify restaurants according to TF-IDF values""" from sklearn.manifold import TSNE # X = w2v_for_each_restaurant X = pd.read_csv("tf_idf_values.csv") rating = pd.read_csv("lratings_w2v.csv") co = rating['0'] # X = whole X_tsne = TSNE(n_components=2,perplexity=5,n_iter=100000,early_exaggeration=70,learning_rate=10,method="exact").fit_transform(X) x_min, x_max = X_tsne.min(0), X_tsne.max(0) X_norm = (X_tsne - x_min) / (x_max - x_min) y = reviews["restaurant"] plt.figure(figsize=(12, 12)) for i in range(X_norm.shape[0]): plt.text(X_norm[i, 0], X_norm[i, 1], y[i], color=plt.cm.Set1(co[i]+1), fontdict={'weight': 'bold', 'size': 12}) plt.xticks([]) plt.yticks([]) plt.show() """Classify restaurants based on the whole reviews""" # X = w2v_for_each_restaurant X = whole X_tsne = TSNE(n_components=2,perplexity=5,n_iter=100000,early_exaggeration=60,learning_rate=10,method="exact",init="pca").fit_transform(X)
def plot_transfer_embeddings(self):
    output_image_path = os.path.join(self.args.output_image_path,
                                     self.args.val_set,
                                     self.args.load_iteration)
    os.makedirs(output_image_path, exist_ok=True)

    speakers = []
    utts = []
    # in_test
    # self.samples = ['252', '240', '237', '341', '274', '236', '272', '329', '271', '301']
    # out_test
    # self.samples = ['232', '305', '227', '238', '263', '339', '376', '318', '286', '312']
    for speaker in self.samples:
        speakers += [speaker] * len(self.indexes[speaker])
        utts += self.indexes[speaker]

    use_spec = 'dmel' if self.args.model_type == 'AdaVAEd' else 'mel'
    dataset = TransferDateset(os.path.join(self.args.data_dir, self.args.dataset),
                              speakers, utts, self.indexes,
                              segment_size=None, load_spectrogram=use_spec)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False,
                            num_workers=4, pin_memory=True)

    embs = []
    embs_tran = []
    for data in dataloader:
        spec = cc(data['spectrogram'])
        spec_d = cc(data['dmel'])
        spec_src = cc(data['source'])
        use_spec = spec_d if self.args.model_type == 'AdaVAEd' else spec
        emb = self.model.get_speaker_embeddings(use_spec)
        with torch.no_grad():
            spec_tran = self.model.inference(spec_src, spec)
            emb_tran = self.model.get_speaker_embeddings(spec_tran)
        embs += emb.detach().cpu().numpy().tolist()
        embs_tran += emb_tran.detach().cpu().numpy().tolist()
        print('Evaluate: {}/{}'.format(len(embs), len(dataloader)), end='\r')

    embs_all = embs + embs_tran
    embs_all = np.array(embs_all)
    norms = np.sqrt(np.sum(embs_all ** 2, axis=1, keepdims=True))
    embs_all = embs_all / norms

    # t-SNE
    print('\nt-SNE...')
    embs_2d = TSNE(n_components=2, init='pca', perplexity=50).fit_transform(embs_all)
    x_min, x_max = embs_2d.min(0), embs_2d.max(0)
    embs_2d = (embs_2d - x_min) / (x_max - x_min)
    embs_2d_src = embs_2d[:len(embs)]
    embs_2d_tran = embs_2d[len(embs):]

    # plot to figure
    female_cluster = [i for i, speaker in enumerate(speakers)
                      if self.speaker_infos[speaker][1] == 'F']
    male_cluster = [i for i, speaker in enumerate(speakers)
                    if self.speaker_infos[speaker][1] == 'M']
    colors = np.array([self.samples_index[speaker] for speaker in speakers])
    plt.scatter(embs_2d_src[female_cluster, 0], embs_2d_src[female_cluster, 1],
                c=colors[female_cluster], marker='x')
    plt.scatter(embs_2d_src[male_cluster, 0], embs_2d_src[male_cluster, 1],
                c=colors[male_cluster], marker='o')
    plt.scatter(embs_2d_tran[female_cluster, 0], embs_2d_tran[female_cluster, 1],
                c=colors[female_cluster], marker='s')
    plt.scatter(embs_2d_tran[male_cluster, 0], embs_2d_tran[male_cluster, 1],
                c=colors[male_cluster], marker='+')
    plt.savefig(os.path.join(output_image_path, 'transfer.png'))
    plt.clf()
    plt.cla()
    plt.close()
    return
# Fragment of a per-tissue scatter plot loop: `train`, `valid`, `tissue`, `color`,
# `zorder`, `T`, and `ax` come from the enclosing code.
alpha = 0.5
hit = np.append(train.rowmeta['Tissue'] == tissue, valid.rowmeta['Tissue'] == tissue)
ax.plot(T[hit, 0], T[hit, 1], linestyle='None', linewidth=0, marker='o',
        markerfacecolor=color, markeredgecolor=color, markersize=2,
        markeredgewidth=0, alpha=alpha, zorder=zorder, label=tissue)
ax.set_xlim(T.min(0)[0], 1.5 * (T.max(0)[0] - T.min(0)[0]) - T.max(0)[0])
#ax.set_ylim(T.min(0)[1], T.max(0)[1]+1*(T.max(0)[1]-T.min(0)[1]))
ax.legend(loc='best', ncol=2, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.1)
ax.tick_params(axis='both', which='major', bottom='off', top='off',
               labelbottom='off', labeltop='off', left='off', right='off')
def main_train(self):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            img_data = facenet.get_dataset(self.datadir)
            path, label = facenet.get_image_paths_and_labels(img_data)
            print("label")
            print(label)
            print('Classes: %d' % len(img_data))
            print('Images: %d' % len(path))

            facenet.load_model(self.modeldir)
            images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            print('Extracting features of images for model')
            batch_size = 1000
            image_size = 160
            nrof_images = len(path)
            nrof_batches_per_epoch = int(math.ceil(1.0 * nrof_images / batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            #print(nrof_batches_per_epoch)
            #for i in range(nrof_batches_per_epoch):
            # Only the first batch (up to batch_size images) is embedded here
            start_index = 0 * batch_size
            end_index = min((0 + 1) * batch_size, nrof_images)
            paths_batch = path[start_index:end_index]
            images = facenet.load_data(paths_batch, False, False, image_size)
            feed_dict = {
                images_placeholder: images,
                phase_train_placeholder: False
            }
            emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

            class_names = [cls.name.replace('_', ' ') for cls in img_data]
            classifier_file_name = os.path.expanduser(self.classifier_filename)

            print('emb_array')
            print(emb_array)

            # Project the embeddings to 2D and rescale to [0, 1]
            X_embedded = TSNE(n_components=2).fit_transform(emb_array)
            X_embedded -= X_embedded.min(axis=0)
            X_embedded /= X_embedded.max(axis=0)
            print("X_embedded")
            print(X_embedded)
            #for i in range(0, nrof_images-1):
            #    plt.plot(X_embedded[i, 0], X_embedded[i, 1], 'bo')
            plt.legend(bbox_to_anchor=(1, 1))
            plt.show()

            # Assign each embedded point to a cell of an out_dim x out_dim grid
            out_dim = 8
            out_res = 160
            to_plot = np.square(out_dim)
            grid = np.dstack(np.meshgrid(np.linspace(0, 1, out_dim),
                                         np.linspace(0, 1, out_dim))).reshape(-1, 2)
            cost_matrix = cdist(grid, X_embedded, "sqeuclidean").astype(np.float32)
            cost_matrix = cost_matrix * (100000 / cost_matrix.max())
            print(cost_matrix)
            #rids, cids = solve_dense(costs)
            #print(rids)
            row_ind, col_ind = linear_sum_assignment(cost_matrix)
            row_asses, col_asses, _ = lapjv(cost_matrix)
            print("lapjv column assignments")
            print(col_asses)
            print("linear_sum_assignment result")
            print(row_ind)
            print(col_ind)
            for r, c in zip(row_ind, col_asses):
                print(r, c)  # Row/column pairings

            # Paste each image into its assigned grid cell
            grid_jv = grid[col_asses]
            out = np.ones((out_dim * out_res, out_dim * out_res, 3))
            print(grid_jv)
            for pos, img in zip(grid_jv, images[0:to_plot]):
                h_range = int(np.floor(pos[0] * (out_dim - 1) * out_res))
                w_range = int(np.floor(pos[1] * (out_dim - 1) * out_res))
                out[h_range:h_range + out_res, w_range:w_range + out_res] = image.img_to_array(img)
            print(out)

            im = image.array_to_img(out)
            im.save("obrazek.jpg", quality=95)
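# A minimal sketch, not part of the original method: linear_sum_assignment on the same
# cost_matrix already pairs every grid cell with one image, so the mosaic could be built
# from its result alone, without lapjv. The helper name and the assumptions that `images`
# is aligned with the rows of X_embedded and holds at least out_dim**2 entries are mine.
def assemble_grid_mosaic(grid, row_ind, col_ind, images, out_dim=8, out_res=160):
    """Paste each assigned image into its grid cell and return the mosaic array."""
    out = np.ones((out_dim * out_res, out_dim * out_res, 3))
    for cell, img_idx in zip(row_ind, col_ind):
        h = int(np.floor(grid[cell, 0] * (out_dim - 1) * out_res))
        w = int(np.floor(grid[cell, 1] * (out_dim - 1) * out_res))
        out[h:h + out_res, w:w + out_res] = image.img_to_array(images[img_idx])
    return out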
from sklearn.manifold import TSNE
import numpy as np
import json

with open('mv_data.json') as f:
    j = json.loads(f.read())


def distance(a, b):
    # Frobenius norm of the difference between the two 40x40 view matrices
    return np.linalg.norm(a.reshape((40, 40)) - b.reshape((40, 40)))


# Flatten each paper's 40x40 view matrix into a 1600-dimensional vector
data = np.array(list(map(lambda x: np.array(x).flatten(),
                         map(lambda x: x['viewMatrix'], j['papers']))))

embed = TSNE(metric=distance).fit_transform(data)

# Rescale the embedding to [-1, 1]
embed -= embed.min(axis=0)
embed /= embed.max(axis=0)
embed *= 2
embed -= 1

with open('tsne.json', 'w') as f:
    f.write(json.dumps(embed.tolist()))
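# A brief sketch, not part of the original script: reshaping does not change the elements,
# so the Frobenius norm above equals np.linalg.norm(a - b) on the flat vectors. Assuming
# the same `data` array, the default Euclidean metric feeds t-SNE identical pairwise
# distances while avoiding the slow per-pair Python callable (name below is illustrative).
embed_default = TSNE(metric="euclidean").fit_transform(data)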
def plot_segment_embeddings(self):
    output_image_path = os.path.join(self.args.output_image_path,
                                     self.args.val_set,
                                     self.args.load_iteration)
    os.makedirs(output_image_path, exist_ok=True)

    speakers = []
    utts = []
    for speaker in self.samples:
        speakers += [speaker] * len(self.indexes[speaker])
        utts += self.indexes[speaker]

    dataset = EvaluateDateset(
        os.path.join(self.args.data_dir, self.args.dataset),
        speakers, utts,
        segment_size=self.config['data_loader']['segment_size'],
        dset=self.args.val_set,
        load_spectrogram='dmel')
    dataloader = DataLoader(dataset, batch_size=128, shuffle=False,
                            num_workers=0, pin_memory=True)
    batchiter = infinite_iter(dataloader)

    embs = []
    speakers = []
    # run the model
    while len(embs) < self.args.n_segments:
        data = next(batchiter)
        speakers += data['speaker']
        data = cc(data['spectrogram'].permute(0, 2, 1))
        emb = self.model.get_speaker_embeddings(data)
        embs += emb.detach().cpu().numpy().tolist()
        print('Evaluate: {}/{}'.format(len(embs), self.args.n_segments), end='\r')

    embs = np.array(embs)
    norms = np.sqrt(np.sum(embs ** 2, axis=1, keepdims=True))
    embs = embs / norms

    # t-SNE
    print('\nt-SNE...')
    embs_2d = TSNE(n_components=2, init='pca', perplexity=50).fit_transform(embs)
    x_min, x_max = embs_2d.min(0), embs_2d.max(0)
    embs_2d = (embs_2d - x_min) / (x_max - x_min)

    # plot to figure
    female_cluster = [i for i, speaker in enumerate(speakers)
                      if self.speaker_infos[speaker][1] == 'F']
    male_cluster = [i for i, speaker in enumerate(speakers)
                    if self.speaker_infos[speaker][1] == 'M']
    colors = np.array([self.samples_index[speaker] for speaker in speakers])
    plt.scatter(embs_2d[female_cluster, 0], embs_2d[female_cluster, 1],
                c=colors[female_cluster], marker='x')
    plt.scatter(embs_2d[male_cluster, 0], embs_2d[male_cluster, 1],
                c=colors[male_cluster], marker='o')
    plt.savefig(os.path.join(output_image_path, 'segment.png'))
    plt.clf()
    plt.cla()
    plt.close()
    return
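# A minimal sketch (illustrative helper, not part of the original class): the manual
# division by row norms above is ordinary L2 normalization, which scikit-learn expresses
# in a single call.
from sklearn.preprocessing import normalize

def l2_normalize_rows(vectors):
    """Rescale each row of a 2-D array-like to unit Euclidean length."""
    return normalize(vectors)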
def convert_to_dict(clusters_to_filter, ru_idfs, fi_idfs, start_time):
    print(start_time)
    if isinstance(clusters_to_filter, dict):
        clusters_to_filter = clusters_to_filter.values()

    clusters_to_save = filter_interesting_clusters(clusters_to_filter)
    json_formatted = []

    cdata = [c.center / c.norm for c in clusters_to_save]
    if len(cdata) < 5:
        return json_formatted

    t_sne_space = TSNE(n_components=2, metric='cosine').fit_transform(cdata)

    # normalize T-SNE space to -1 to 1
    minimums = t_sne_space.min(axis=0)
    maximums = t_sne_space.max(axis=0)
    for v in t_sne_space:
        v[0] = 2 * (v[0] - minimums[0]) / (maximums[0] - minimums[0]) - 1
        v[1] = 2 * (v[1] - minimums[1]) / (maximums[1] - minimums[1]) - 1

    for cluster_index in range(len(clusters_to_save)):
        c = clusters_to_save[cluster_index]
        idfs = ru_idfs if c.lang == 'ru' else fi_idfs

        # TODO remove temporary filtering
        # if (c.created_at < (start_time - len(c.hourly_growth_rate) * 3600 * 1000)):  # 1405555200000: 17/07/2014 00:00:00
        #     continue
        # if (c.created_at < 1503014400000):  # 18/08/2017 00:00:00
        #     continue
        if len(c.hourly_growth_rate) < 1:
            continue

        start_idx = max(int((c.created_at - start_time) / 3600 / 1000), 1)
        for i in range(start_idx, len(c.hourly_growth_rate)):
            update = {}
            # timestamp
            update['t'] = int(c.first_growth_time / 1000) + i * 60 * 60

            if i == start_idx:
                # start with a new cluster event
                total_sentiment = c.hourly_accum_sentiment[len(c.hourly_accum_sentiment) - 1]
                tags = c.hourly_tags[len(c.hourly_tags) - 1]
                if tags is not None:
                    tags = [tag_label_overrides.get(t, t.title()) for t in tags]
                # get_keywords(c, idfs)[:4],
                update['n'] = {
                    c.id: {
                        's': round(c.hourly_growth_rate[i]),
                        'k': c.hourly_keywords[i],
                        'lang': c.lang,
                        'sentiment': round(c.hourly_sentiment[i], 3),
                        'sentiment_total': round(total_sentiment, 3),
                        'tags': tags if tags is not None else [],
                        't_sne': [float(t_sne_space[cluster_index][0]),
                                  float(t_sne_space[cluster_index][1])]
                    }
                }
            elif i == len(c.hourly_growth_rate):
                # insert a negative number at the end to mark the end of the cluster
                update['u'] = {c.id: {'s': -1}}
            else:
                update['u'] = {
                    c.id: {
                        's': int(round(c.hourly_growth_rate[i])),
                        'sentiment': round(c.hourly_sentiment[i], 3),
                        'sentiment_accum': round(c.hourly_accum_sentiment[i], 3),
                        'k': c.hourly_keywords[i - 1]
                    }
                }
            json_formatted.append(update)

    json_formatted.sort(key=lambda update: update['t'])
    return json_formatted
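# A minimal sketch (hypothetical helper, not used above): the per-point loop that maps
# t_sne_space into [-1, 1] can also be written as one vectorized expression.
import numpy as np

def minmax_scale_to_pm1(points):
    """Column-wise min-max scale an (n, 2) array into the range [-1, 1]."""
    points = np.asarray(points, dtype=float)
    mins = points.min(axis=0)
    maxs = points.max(axis=0)
    return 2 * (points - mins) / (maxs - mins) - 1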