def main():
    """
    Run the self-supervised alignment (S4) experiment on one corpus pair.

    Loads the source and target word vectors, restricts them to their common
    vocabulary, selects landmarks with `s4`, re-aligns both embeddings on
    those landmarks and plots the distribution of the per-word `cosine`
    values for landmark and non-landmark words.
    """
    # Alternative corpus pairs; swap in to run on other data.
    # wv_source = "wordvectors/latin/corpus1/0.vec"
    # wv_target = "wordvectors/latin/corpus2/0.vec"
    # wv_source = "wordvectors/source/theguardianuk.vec"
    # wv_target = "wordvectors/source/thenewyorktimes_1.vec"
    wv_source = "wordvectors/semeval/latin-corpus1.vec"
    wv_target = "wordvectors/semeval/latin-corpus2.vec"
    # wv_source = "wordvectors/usuk/bnc.vec"
    # wv_target = "wordvectors/usuk/coca_mag.vec"
    # wv_source = "wordvectors/artificial/NYT-0.vec"
    # wv_target = "wordvectors/artificial/NYT-500_random.vec"
    plt.style.use("seaborn")

    # Read WordVectors
    normalized = False
    wv1 = WordVectors(input_file=wv_source, normalized=normalized)
    wv2 = WordVectors(input_file=wv_target, normalized=normalized)

    # Keep only the vocabulary shared by both embeddings.
    wv1, wv2 = intersection(wv1, wv2)

    landmarks, non_landmarks, _ = s4(wv1, wv2,
                                     cls_model="nn",
                                     n_targets=100,
                                     n_negatives=100,
                                     rate=1,
                                     t=0.5,
                                     iters=100,
                                     verbose=1,
                                     plot=1)
    wv1, wv2, _ = align(wv1, wv2, anchor_words=landmarks)

    d_l = [cosine(wv1[w], wv2[w]) for w in landmarks]
    d_n = [cosine(wv1[w], wv2[w]) for w in non_landmarks]

    # Label both series: plt.legend() only shows artists that carry a label,
    # so without these the legend would be empty.
    sns.distplot(d_l, color="blue", label="landmarks")
    sns.distplot(d_n, color="red", label="non-landmarks")
    plt.legend()
    plt.show()
def main():
    """
    Performs tests on SemEval2020-Task 1 data on Unsupervised Lexical
    Semantic Change Detection.

    This experiment is designed to evaluate the performance of different
    landmark selection approaches, showing how the classification
    performance is affected by the landmark choices.

    Command-line arguments:
        --languages  One or more of english/german/latin/swedish
                     (default: all four).
        --cls        Classifier used on the aligned vectors: "cosine"
                     (sweeps fixed thresholds), "cosine-auto" (threshold from
                     `threshold_crossvalidation`) or "s4".

    For every language and alignment method the embeddings are aligned on
    the selected landmarks, the target words are classified, and a Markdown
    table with the mean and max accuracy is printed at the end.
    """
    np.random.seed(1)

    align_methods = [
        "s4", "noise-aware", "top-10", "bot-10", "global", "top-5", "bot-5"
    ]
    # Fraction of the vocabulary used by the frequency-slice methods.
    slice_fractions = {
        "top-5": 0.05, "top-10": 0.1, "top-50": 0.50,
        "bot-5": 0.05, "bot-10": 0.1, "bot-50": 0.50,
    }

    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", nargs="+",
                        help="Languages to use",
                        default=["english", "german", "latin", "swedish"])
    parser.add_argument("--cls", choices=["cosine", "s4", "cosine-auto"],
                        default="cosine",
                        help="Classifier to use")
    args = parser.parse_args()
    languages = args.languages
    classifier = args.cls

    # Per-language keyword arguments for the s4 landmark selection.
    align_params = {
        "english": {"n_targets": 100, "n_negatives": 50,
                    "rate": 1, "iters": 100},
        "german": {"n_targets": 100, "n_negatives": 200,
                   "rate": 1, "iters": 100},
        "latin": {"n_targets": 10, "n_negatives": 4,
                  "rate": 0.5, "iters": 100},
        "swedish": {"n_targets": 100, "n_negatives": 200,
                    "rate": 1, "iters": 100},
    }

    # Per-language keyword arguments for the s4 classifier.
    cls_params = {
        "english": {"n_targets": 100, "n_negatives": 50,
                    "rate": 1, "iters": 500},
        "german": {"n_targets": 50, "n_negatives": 200},
        "latin": {"n_targets": 50, "n_negatives": 10},
        "swedish": {"n_targets": 120, "n_negatives": 120},
    }

    # Per-language keyword arguments for threshold_crossvalidation.
    auto_params = {
        "english": {"rate": 1.5, "n_fold": 1,
                    "n_targets": 50, "n_negatives": 100},
        "german": {"rate": 1, "n_fold": 1,
                   "n_targets": 200, "n_negatives": 100},
        "latin": {"rate": 1, "n_targets": 100, "n_negatives": 15},
        "swedish": {"rate": 1, "n_targets": 100, "n_negatives": 200},
    }

    normalized = False
    # All result containers are indexed as [align_method][lang].
    accuracies = defaultdict(dict)
    true_positives = defaultdict(dict)
    false_negatives = defaultdict(dict)
    cm = defaultdict(dict)

    for lang in languages:
        thresholds = np.arange(0.1, 1, 0.1)

        # NOTE(review): both tasks read the same truth file, so the
        # "ranking" is just the class column parsed as float -- confirm
        # whether task 2 should point at a separate graded file.
        path_task1 = "data/semeval/truth/%s.txt" % lang
        path_task2 = "data/semeval/truth/%s.txt" % lang

        with open(path_task1) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            targets, true_class = zip(*data)
            y_true = np.array(true_class, dtype=int)

        with open(path_task2) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            _, true_ranking = zip(*data)
            true_ranking = np.array(true_ranking, dtype=float)

        corpus1_path = "wordvectors/semeval/%s-corpus1.vec" % lang
        corpus2_path = "wordvectors/semeval/%s-corpus2.vec" % lang
        wv1 = WordVectors(input_file=corpus1_path, normalized=normalized)
        wv2 = WordVectors(input_file=corpus2_path, normalized=normalized)

        wv1, wv2 = intersection(wv1, wv2)
        vocab = wv1.words
        n_vocab = len(vocab)

        for align_method in align_methods:
            accuracies[align_method][lang] = list()
            true_positives[align_method][lang] = list()
            false_negatives[align_method][lang] = list()
            cm[align_method][lang] = np.zeros((2, 2))

            # --- Landmark selection ---
            if align_method == "global":
                landmarks = vocab
            elif align_method == "noise-aware":
                Q, alpha, landmarks, non_landmarks = noise_aware(
                    wv1.vectors, wv2.vectors)
                # noise_aware yields indices; align expects words.
                landmarks = [vocab[i] for i in landmarks]
            elif align_method == "s4":
                landmarks, non_landmarks, Q = s4(
                    wv1, wv2,
                    cls_model="nn",
                    verbose=0,
                    **align_params[lang],
                )
            elif align_method in slice_fractions:
                n_sel = int(n_vocab * slice_fractions[align_method])
                if align_method.startswith("top"):
                    # "top-N" keeps the FIRST N% of the vocabulary
                    # (presumably the most frequent words -- assumes the
                    # vector file is frequency-ordered; TODO confirm).
                    landmarks = vocab[:n_sel]
                else:
                    # Explicit start index so n_sel == 0 yields an empty
                    # slice rather than the whole vocabulary.
                    landmarks = vocab[n_vocab - n_sel:]
            else:
                raise ValueError(
                    "Unknown alignment method: %s" % align_method)

            wv1_, wv2_, Q = align(wv1, wv2, anchor_words=landmarks)

            # --- Classification of the target words ---
            if classifier == "cosine":
                # Cosine-based classifier, evaluated at every threshold.
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in vocab])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[w.lower()]] for w in targets])
                y_pred = vote(x.reshape(-1, 1))

                for threshold in thresholds:
                    y_bin = (y_pred > threshold)
                    accuracy = accuracy_score(y_true, y_bin)

                    _, _, fn, tp = confusion_matrix(y_true, y_bin).ravel()
                    cm[align_method][lang] += confusion_matrix(
                        y_true, y_bin, normalize="all")
                    accuracies[align_method][lang].append(
                        round(accuracy, 2))
                    true_positives[align_method][lang].append(round(tp, 2))
                    false_negatives[align_method][lang].append(
                        round(fn, 2))
            elif classifier == "cosine-auto":
                # Threshold chosen by cross-validation instead of a sweep.
                t_cos = threshold_crossvalidation(wv1_, wv2_,
                                                  iters=1,
                                                  **auto_params[lang],
                                                  landmarks=landmarks)
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in vocab])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[w.lower()]] for w in targets])
                y_pred = vote(x.reshape(-1, 1))
                y_bin = y_pred > t_cos
                accuracy = accuracy_score(y_true, y_bin)
                accuracies[align_method][lang].append(round(accuracy, 2))
            elif classifier == "s4":
                model = s4(wv1_, wv2_,
                           landmarks=landmarks,
                           verbose=0,
                           **cls_params[lang],
                           update_landmarks=False)
                # Concatenate vectors of target words for prediction
                x = np.array([
                    np.concatenate((wv1_[w.lower()], wv2_[w.lower()]))
                    for w in targets
                ])
                y_pred = model.predict(x)
                y_bin = y_pred > 0.5
                accuracy = accuracy_score(y_true, y_bin)
                print(accuracy)
                accuracies[align_method][lang].append(round(accuracy, 2))

            # Rank correlation against the truth values; computed but not
            # currently reported.
            rho, pvalue = spearmanr(true_ranking, y_pred)

    # Markdown summary table.
    print("|Method|Language|Mean acc.|Max acc.|")
    print("|------|--------|---------|--------|")
    for method in accuracies:
        print("|", method, end="|")
        for lang in accuracies[method]:
            print(lang,
                  round(np.mean(accuracies[method][lang]), 2),
                  np.max(accuracies[method][lang]),
                  sep="|", end="|\n")
        print()
def main():
    """
    Compare alignment strategies on a pair of arXiv category embeddings.

    The following experiments are available:
        - Find most stable words in each ArXiv category
          (cs, math, cond-mat, physics)
        - Find most unstable (changed) words in each category
        - Finds stable/unstable words across categories
        - Using different alignment strategies

    Command-line arguments:
        cat1  Path to the word vectors of the first arXiv category.
        cat2  Path to the word vectors of the second arXiv category.

    Results for the global, noise-aware and self-supervised alignments are
    printed and written to ``results/arxiv/<cat1>-<cat2>.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("cat1", type=str,
                        help="Name of first arXiv category")
    parser.add_argument("cat2", type=str,
                        help="Name of second arXiv category")
    args = parser.parse_args()

    cat1 = args.cat1
    cat2 = args.cat2

    # Output name is built from the last path component of each input.
    cat1_name = cat1.split("/")[-1]
    cat2_name = cat2.split("/")[-1]

    path_out = "results/arxiv/"

    wva = WordVectors(input_file=cat1)
    wvb = WordVectors(input_file=cat2)
    wva, wvb = intersection(wva, wvb)
    wva, wvb, Q = align(wva, wvb)
    words = wva.words
    print("-- Common vocab", len(words))

    # each column of this matrix will store a set of results for a method
    # (columns 3 and 4, "top"/"bot", are never filled and stay at zero)
    out_grid = np.zeros((len(words), 5))

    d = distribution_of_change(wva, wvb)
    print("====== GLOBAL")
    print("=> landmarks", len(wva.words))
    print_table(d, wva.words)
    out_grid[:, 0] = d  # add first column

    print("====== Noise Aware")
    Q, alpha, landmarks, noisy = noise_aware(wva.vectors, wvb.vectors)
    # noise_aware returns landmark indices; align() takes anchor *words*,
    # so map the indices back to vocabulary entries first.
    landmarks = [wva.words[i] for i in landmarks]
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    print("=> landmarks", len(landmarks))
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 1] = d  # add new column

    print("===== SELF")
    landmarks, nonl, Q = s4(wva, wvb, iters=100, verbose=1)
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 2] = d  # last column

    # WRITE-OUT
    # Create the results directory up front so open() cannot fail on a
    # fresh checkout.
    os.makedirs(path_out, exist_ok=True)
    out_file = os.path.join(path_out, "%s-%s.csv" % (cat1_name, cat2_name))
    with open(out_file, "w") as fout:
        fout.write("word,global,noise-aware,self,top,bot\n")
        for i, w in enumerate(words):
            fout.write("%s,%.3f,%.3f,%.3f,%.3f,%.3f\n" % (
                w, out_grid[i][0], out_grid[i][1], out_grid[i][2],
                out_grid[i][3], out_grid[i][4]))
def main():
    """
    Classify UK/US word pairs as similar or dissimilar after alignment.

    Command-line arguments:
        alignment  Landmark selection used to align the UK embedding to the
                   US one (top-5, top-10, bot-5, bot-10, noise-aware,
                   global or s4).
        --rounds   Number of times the S4 classifier is trained and
                   evaluated (must be >= 1).

    Word pairs are read from the similar/dissimilar dictionaries, filtered
    to the common vocabulary and labelled 0 (similar) or 1 (dissimilar).
    Three predictors are scored on those pairs: the S4 classifier, the
    `cosine` value thresholded at fixed cut-offs, and the noisy set returned
    by `noise_aware`. Accuracy, precision, recall and F1 are printed as
    Markdown table rows.
    """
    parser = argparse.ArgumentParser()
    # Required positional: a default would never be used, so none is given.
    parser.add_argument("alignment",
                        choices=['top-5', 'top-10', 'noise-aware',
                                 'bot-5', 'bot-10', 'global', 's4'],
                        help="Method to use in the alignment of UK to US")
    parser.add_argument("--rounds", type=int, default=1,
                        help="No. of rounds to run the classifications")
    args = parser.parse_args()
    if args.rounds < 1:
        # The summary table needs at least one row of scores.
        parser.error("--rounds must be at least 1")

    path_us = "wordvectors/ukus/coca.vec"
    path_uk = "wordvectors/ukus/bnc.vec"
    path_dict = "data/ukus/dict_similar.txt"
    path_dict_dis = "data/ukus/dict_dissimilar.txt"
    normalized = False

    wv1 = WordVectors(input_file=path_uk, normalized=normalized)
    wv2 = WordVectors(input_file=path_us, normalized=normalized)
    wv_uk, wv_us = intersection(wv1, wv2)

    # Load dictionaries of words
    with open(path_dict) as fin:
        # "uk_word us_word" per line
        dico_sim = [line.strip().split(" ", 1) for line in fin]
    with open(path_dict_dis) as fin:
        # one word per line, paired with itself
        dico_dis = [(line.strip(), line.strip()) for line in fin]

    # Filter words not in the vocabulary of either UK or US corpora
    dico_sim = [(a, b) for a, b in dico_sim
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico_dis = [(a, b) for a, b in dico_dis
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico = dico_sim + dico_dis

    # Create true labels for terms
    # 0 -> similar | 1 -> dissimilar
    y_true = [0] * len(dico_sim) + [1] * len(dico_dis)

    m = args.alignment
    vocab = wv_us.words
    n_vocab = len(vocab)
    # Fraction of the vocabulary used by the frequency-slice methods.
    slice_fractions = {"top-10": 0.1, "top-5": 0.05,
                       "bot-10": 0.1, "bot-5": 0.05}

    # Pick `anchors` (words the alignment is fitted on) and `landmarks`
    # (words handed to the S4 classifier below). They coincide for every
    # method except "global".
    if m == "noise-aware":
        Q, alpha, landmarks, noise = noise_aware(wv_uk.vectors,
                                                 wv_us.vectors)
        # noise_aware yields indices; align expects words.
        landmarks = [wv_uk.words[i] for i in landmarks]
        anchors = landmarks
    elif m == "global":
        # Align on the full vocabulary, but train the classifier with the
        # first half of it as landmarks.
        anchors = vocab
        landmarks = vocab[:n_vocab // 2]
    elif m == "s4":
        landmarks, non_landmarks, Q = s4(
            wv_uk, wv_us,
            cls_model="nn",
            verbose=0,
            iters=100,
            n_targets=100,
            n_negatives=10,
            rate=0.25,
        )
        anchors = landmarks
    else:
        n_sel = int(n_vocab * slice_fractions[m])
        if m.startswith("top"):
            landmarks = vocab[:n_sel]
        else:
            # Explicit start index so n_sel == 0 yields an empty slice
            # rather than the whole vocabulary.
            landmarks = vocab[n_vocab - n_sel:]
        anchors = landmarks

    # Every method aligns exactly once, so a_, b_ and Q are always defined.
    a_, b_, Q = align(wv_uk, wv_us, anchor_words=anchors)

    # Apply the learned transform to the full (unfiltered) UK vocabulary so
    # any dictionary word can be looked up.
    wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))

    self_scores = list()
    cos_scores = list()
    na_scores = list()
    iters = 100
    # Interval to vary cosine thresholds
    cos_thresholds = [0.3, 0.5, 0.7]

    # Pair features do not change between rounds, so build them once.
    # A KeyError here means a dictionary word is missing from the
    # embeddings and is deliberately left to propagate.
    x = np.array([np.concatenate((wv1_[a], wv2[b])) for a, b in dico])
    x_cos = np.array([cosine(wv1_[a], wv2[b]) for a, b in dico])
    v_a = np.array([wv1_[a] for a, _ in dico])
    v_b = np.array([wv2[b] for _, b in dico])

    # Run several rounds, if given
    for _ in range(args.rounds):
        model = s4(a_, b_, iters=iters,
                   landmarks=landmarks,
                   verbose=0,
                   n_targets=1000,
                   n_negatives=1000,
                   rate=0.25,
                   cls_model="nn",
                   update_landmarks=False)

        # S4 classifier metrics
        y_pred = (model.predict(x) > 0.5)
        self_scores.append([accuracy_score(y_true, y_pred),
                            precision_score(y_true, y_pred),
                            recall_score(y_true, y_pred),
                            f1_score(y_true, y_pred)])

        # Cosine metrics: one row per threshold, so rows are interleaved as
        # round0/t0, round0/t1, round0/t2, round1/t0, ...
        for t in cos_thresholds:
            y_pred_cos = (x_cos > t)
            cos_scores.append([
                round(accuracy_score(y_true, y_pred_cos), 2),
                round(precision_score(y_true, y_pred_cos), 2),
                round(recall_score(y_true, y_pred_cos), 2),
                round(f1_score(y_true, y_pred_cos), 2),
            ])

        # Noise-Aware metrics
        # Apply noise-aware to the pairs (u, v):
        # 0 if pair is clean, 1 if pair is noisy
        Q, alpha, clean, noisy = noise_aware(v_a, v_b)
        y_pred_na = np.zeros((len(dico)))
        for i in noisy:
            y_pred_na[i] = 1
        na_scores.append([
            round(accuracy_score(y_true, y_pred_na), 2),
            round(precision_score(y_true, y_pred_na), 2),
            round(recall_score(y_true, y_pred_na), 2),
            round(f1_score(y_true, y_pred_na), 2),
        ])

    self_scores = np.array(self_scores)
    cos_scores = np.array(cos_scores)
    na_scores = np.array(na_scores)

    # Print Markdown Table
    n_thresholds = len(cos_thresholds)
    for j, t in enumerate(cos_thresholds):
        print("|COS %.2f" % t, m, sep="|", end="|")
        for i in range(4):
            # Rows j, j + n_thresholds, ... are the rounds for threshold t;
            # a plain [j:] slice would mix in the other thresholds.
            per_round = cos_scores[j::n_thresholds, i]
            print("%.2f" % (round(per_round.mean(), 2)), end="|", sep=" ")
        print("|")

    print("|S4-D", m, end="|", sep="|")
    for i in range(4):
        print("%.2f +- %.2f" % (round(self_scores[:, i].mean(), 2),
                                round(self_scores[:, i].std(), 2)),
              end="|", sep=" ")
    print("|")

    # Only the first round's noise-aware scores are reported.
    print("|Noisy-Pairs", "-", *na_scores[0], sep="|", end="|\n")