def svm_robust_score(args, data, labels):
    idx_for_split = int(0.2 * len(data))
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(
            noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(
            noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                  len(phrases)))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)  # flag pairs where the model produced an all-zero vector
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5  # undefined similarity (zero vector): fall back to chance
    pr = pd.DataFrame(pred)
    train = pr.iloc[idx_for_split:]
    test = pr.iloc[:idx_for_split]
    train_label = labels[idx_for_split:]
    test_label = labels[:idx_for_split]
    roc_auc = linear_svm(train, test, train_label, test_label)
    with open("results_entail_" + args.model_type + ".txt", "at") as f_out:
        # f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
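def linear_svm_sketch(train, test, train_label, test_label):
    # Sketch of the assumed contract of `linear_svm` (defined elsewhere in the
    # repo): fit a linear SVM on the training split and score the held-out
    # split with ROC AUC. Illustration only, not the repo's implementation.
    from sklearn.svm import LinearSVC
    from sklearn.metrics import roc_auc_score
    clf = LinearSVC()
    clf.fit(train, train_label)
    scores = clf.decision_function(test)  # signed margins as ranking scores
    return roc_auc_score(test_label, scores)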
def get_robust_score(args, pairs, true):
    if "robust" in args.mode:
        pred = []
        phrases = []
        for index, row in pairs.iterrows():
            phrases.append(
                noise_generator(row["sentence1"], args.noise_level, chars))
            phrases.append(
                noise_generator(row["sentence2"], args.noise_level, chars))
        from sample import sample_multi
        # squeeze so each row is a 1-D vector, as cosine() expects
        results = np.squeeze(
            np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                      len(phrases)))
        for i in range(0, len(results), 2):
            v1 = results[i]
            v2 = results[i + 1]
            if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
                print(i)
            pred.append(1 - cosine(v1, v2))
            if math.isnan(pred[-1]):
                pred[-1] = 0.5
        with open("results_entail_" + args.model_type + ".txt", "at") as f_out:
            # f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
            f_out.write(args.mode + ",%.2f,%.3f\n" %
                        (args.noise_level, roc_auc_score(true, pred)))
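def noise_generator_sketch(text, noise_level, chars):
    # Sketch of the assumed behaviour of `noise_generator` (imported elsewhere
    # in the repo): corrupt each character with probability `noise_level` by
    # swapping in a random vocabulary character. Illustration only, not the
    # repo's actual noise model.
    import random
    return "".join(
        random.choice(chars) if random.random() < noise_level else ch
        for ch in text)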
def sampling_sentiment_data(args, data, labels):
    idx_for_split = int(0.2 * len(data))
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, data, args.model_type),
                  len(data)))
    train = results[idx_for_split:]
    test = results[:idx_for_split]
    train_label = labels[idx_for_split:]
    test_label = labels[:idx_for_split]
    # keras_test(train, test, train_label, test_label)
    roc_auc = linear_svm(train, test, train_label, test_label)
    with open(args.model_type + "_results_sentiment.txt", "at") as f_out:
        f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, roc_auc))
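# Hypothetical usage of sampling_sentiment_data (the file name and column
# names below are assumptions for illustration, not from the repo):
#
#   reviews = pd.read_csv("sentiment.csv")
#   sampling_sentiment_data(args, reviews["text"].tolist(),
#                           reviews["label"].values)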
def svm_robust_score(args, data, labels):
    idx_for_split = int(0.2 * len(data))
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                  len(phrases)))
    # pairs_vectors = zip(results[0::2], results[1::2])
    # df = pd.DataFrame(columns=['cosine', 'canberra', 'cityblock', 'euclidean',
    #                            'minkowski', 'braycurtis', "skew_q1", "skew_q2",
    #                            "kur_q1", "kur_q2", "skew_diff", "kur_diff"])
    # df['cosine'] = [cosine(x, y) for (x, y) in pairs_vectors]
    # print(len(df))
    # df['canberra'] = [canberra(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # df['cityblock'] = [cityblock(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # df['euclidean'] = [euclidean(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # df['minkowski'] = [minkowski(x, y, 3) for (x, y) in zip(results[0::2], results[1::2])]
    # df['braycurtis'] = [braycurtis(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # question1_vec = results[0::2]
    # question2_vec = results[1::2]
    # data['skew_q1'] = [skew(x) for x in question1_vec]
    # data['skew_q2'] = [skew(x) for x in question2_vec]
    # data['kur_q1'] = [kurtosis(x) for x in question1_vec]
    # data['kur_q2'] = [kurtosis(x) for x in question2_vec]
    #
    # data['skew_diff'] = np.abs(data['skew_q1'] - data['skew_q2'])
    # data['kur_diff'] = np.abs(data['kur_q1'] - data['kur_q2'])
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5
    # pr = pd.DataFrame(pred)
    # train = df.iloc[idx_for_split:]
    # test = df.iloc[:idx_for_split]
    # train_label = labels[idx_for_split:]
    # test_label = labels[:idx_for_split]
    roc_auc = roc_auc_score(labels, pred)
    # clf = catboost.CatBoostClassifier(depth=6, iterations=5000, learning_rate=0.1, thread_count=16)
    # clf.fit(train, train_label)
    # y_proba = clf.predict_proba(test)[:, 1]
    # roc_auc = roc_auc_score(test_label, y_proba)
    with open("results_quora_" + args.model_type + ".txt", "at") as f_out:
        # f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
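# Hypothetical usage on Quora-style duplicate pairs (file and column names
# are illustrative assumptions, not from the repo):
#
#   pairs = pd.read_csv("quora_pairs.csv")  # sentence1, sentence2, is_duplicate
#   svm_robust_score(args, pairs, pairs["is_duplicate"].values)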
def get_mean_vec(text):
    # Average the word2vec vectors of the in-vocabulary tokens of `text`.
    # (The opening of this helper was truncated; the header and loop below are
    # a reconstruction, and whitespace tokenization is an assumption.)
    vectors = []
    for lemma in text.split():
        if lemma in w2v:
            vector = w2v[lemma]
            vectors.append(vector)
    return np.mean(vectors, axis=0)

for pair in tqdm(pairs):
    v1 = get_mean_vec(noise_generator(pair["text_1"]))
    v2 = get_mean_vec(noise_generator(pair["text_2"]))
    pred.append(1 - cosine(v1, v2))
with open("results.txt", "at") as f_out:
    f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
# print "ROC\t\t=\t%.2f" % roc_auc_score(true, pred)

if "robust" in args.mode:
    pred = []
    phrases = []
    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"]))
        phrases.append(noise_generator(pair["text_2"]))
    from sample import sample_multi
    # squeeze so each row is a 1-D vector, as cosine() expects
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases), len(phrases)))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        pred.append(1 - cosine(v1, v2))
    with open("results.txt", "at") as f_out:
        f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
    # print "ROC\t\t=\t%.2f" % roc_auc_score(true, pred)
    # print "Class ratio\t=\t%.2f" % (float(len(filter(None, true)))/len(true))
    # print "F1\t=\t%.2f" % f1_score(true, pred)
                    type=str, default="save")
parser.add_argument("-t", "--model_type", help="type of model used to train",
                    type=str, default="biSRU")
parser.add_argument("-i", "--input-dir", help="dir to go through")
args = parser.parse_args()

with open(join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
    chars, _ = cPickle.load(f)

if "robust" in args.mode:
    filenames = []
    phrases = []
    # one embedding per file: join all of its lines into a single phrase
    for filename in glob(join(args.input_dir, "*.txt")):
        filenames.append(filename)
        with open(filename, "rt") as f:
            lines = [line.strip() for line in f.readlines()]
            phrases.append(" ".join(lines))
    from sample import sample_multi
    results = np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                        len(phrases))
    for i in range(len(results)):
        np.savetxt(filenames[i] + ".rove", results[i])
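# Assumed contract of `sample_multi` (defined in sample.py, not shown here):
# it embeds each input phrase with the model stored in `save_dir` and returns
# one stacked array with a row per phrase, which is why every caller splits it
# back into per-phrase vectors with np.vsplit(..., len(phrases)).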
def main(data_dir, template, output, save_dir):
    clusters = {}
    clustered = {}
    print('Load clustermap')
    with open(os.path.join(data_dir, 'SBEADS.resC'), 'r') as fin:
        for line in tqdm(fin):
            tab = line.index('\t')
            cluster_id = int(line[:tab])
            ids = [int(t) for t in line[tab + 1:].split()]
            for i in ids:
                clustered[i] = cluster_id
            clusters[cluster_id] = ids
    print('Load messages')
    data = []
    with open(os.path.join(data_dir, 'docs.out'), 'r') as fin:
        for line in tqdm(fin):
            # take care of escape symbols
            filtered = line.replace(r"\'", "'").replace('\\', '/')
            try:
                entry = json.loads(filtered)
                id = int(entry.get('id'))
                title = entry.get('title')
                data.append((id, title))
            except (ValueError, TypeError):  # malformed JSON or missing id
                print(filtered)
    word2vectors = sample_multi(save_dir, [t[1] for t in data])
    vectors = np.zeros((len(data), word2vectors[0][0].shape[2]))
    for i in range(len(word2vectors)):
        vectors[i, :] = np.mean(np.squeeze(np.array(word2vectors[i])), axis=0)
    X = vectors

    print('Compute natural w2v clusterization quality.')
    # ATTENTION: very dirty code, just let it work
    n = 100000
    clust_numbers = list(clusters.keys())
    selected = np.random.choice(clust_numbers, n)
    res = []
    # not sure if they are continuous:
    indexes = {t[0]: i for (i, t) in enumerate(data)}
    misscounter = 0
    for i in tqdm(selected):
        tmp = []
        for j in clusters[i]:
            if j in indexes:
                tmp.append(indexes[j])
            else:
                misscounter += 1
        if len(tmp) < 2:
            # bad, let it go
            # print('Pass')
            continue
        # get a positive pair from the same cluster
        pair = np.random.choice(tmp, 2)
        one = X[pair[0], :]
        two = X[pair[1], :]
        sim = 1.0 - spatial.distance.cosine(one, two)
        if np.isnan(sim) or np.isinf(sim):
            sim = 0.0
        res.append((1.0, sim))
        # (try to) get a non-pair
        one_idx = np.random.choice(tmp)
        two_idx = np.random.randint(0, len(X))
        gnd = 1.0 * (two_idx in tmp)
        one = X[one_idx, :]
        two = X[two_idx, :]
        sim = 1.0 - spatial.distance.cosine(one, two)
        if np.isnan(sim) or np.isinf(sim):
            sim = 0.0
        res.append((gnd, sim))
    res = np.array(res)
    print("Classes ratio:\t%.3f" % (sum(res[:, 0]) / len(res)))
    print("MSE:\t\t%.3f" % mean_squared_error(res[:, 0], res[:, 1]))
    print("AUC:\t\t%.3f" % roc_auc_score(res[:, 0], res[:, 1]))
    # last result was
    # Classes ratio: 0.500
    # MSE:           0.106
    # AUC:           0.968
    # End of ATTENTION

    labels = np.array([clustered.get(t[0], -1) for t in data])
    score = silhouette_score(X, labels, sample_size=1000)  # gives about 0.77
    print('Natural w2v silhouette_score is {}'.format(score))
    idx = (labels > -1)
    score = silhouette_score(X[idx], labels[idx], sample_size=1000)  # gives about 0.87
    print('Without unclustered samples score is {}'.format(score))

    # Preparation for visualization. Still unfinished.
    print('Compute 2d projection')
    pca = PCA(n_components=2)
    X2 = pca.fit_transform(X)
    print('Save the data')
    repack = []
    with open('data.csv', 'w') as fout:
        for i, x in zip([t[0] for t in data], X2):
            q = (x[0], x[1], i, clustered.get(i, -1))
            fout.write('{:.2f},{:.2f},{},{}\n'.format(*q))
            repack.append(q)
    repack = json.dumps(repack)
    # Experiment with coarsened coordinates:
    d1 = len(set(['{:.1f},{:.1f}'.format(x[0], x[1]) for x in X2]))
    d2 = len(set(['{:.2f},{:.2f}'.format(x[0], x[1]) for x in X2]))
    d3 = len(set(['{:.3f},{:.3f}'.format(x[0], x[1]) for x in X2]))
    print('We can coarsen the data: ')
    print(d1)
    print(d2)
    print(d3)
    with open(template, 'r') as fin:
        page = Template(fin.read()).render(data=repack)
    with open(output, 'w') as fout:
        fout.write(page)
    with open('labels.csv', 'w') as fout:
        for i, title in data:
            fout.write('{}\t{}\n'.format(i, title))
    print('Done')
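# Hypothetical invocation of main (paths are illustrative assumptions):
#
#   main(data_dir="data", template="clusters_template.html",
#        output="clusters.html", save_dir="save")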
# king_queen = 1 - cosine(king, queen)
#
# man_king = 1 - cosine(man, king)
# woman_queen = 1 - cosine(woman, queen)
#
# print("Distances")
# print("woman_man:{} \n king_queen:{} \n".format(woman_man, king_queen))
# print("man_king:{} \n woman_queen:{} \n".format(man_king, woman_queen))
#
# print("Queen similarity")
# print(1 - cosine(king - man + woman, queen))

pos1 = "You have no affinity for most of the characters ."
pos2 = "The characters , cast in impossibly contrived situations , are totally estranged from reality ."
neg1 = "Everybody loves a David and Goliath story , and this one is told almost entirely from David 's point of view ."
neg2 = "Those who want to be jolted out of their gourd should drop everything and run to Ichi."

positive = [pos1, pos2]
negative = [neg1, neg2]
vec = positive + negative
print(len(vec))
results = sample_multi(DEFAULT_MODEL, vec, "biLSTM")

# within-class vs. cross-class cosine similarities
pos_pos = 1 - cosine(results[0], results[1])
neg_neg = 1 - cosine(results[2], results[3])
pos_neg = 1 - cosine(results[1], results[3])
neg_pos = 1 - cosine(results[2], results[0])
print("pos_pos {}".format(pos_pos))
print("neg_neg {}".format(neg_neg))
print("neg_pos {}".format(neg_pos))
print("pos_neg {}".format(pos_neg))