def inject_change_batch(wv, changes, alpha, replace=True):
    """
    Given a WordVectors object and a list of word pairs, perform fast
    injection of semantic change by using the update rule from Word2Vec.
    wv      - WordVectors (input)
    changes - list of n tuples (a, b) that drive the change such that b->a,
              i.e.: simulates using b in the contexts of a
    alpha   - degree to which the change is injected
              if scalar: apply the same alpha to every pair
              if array-like: requires size n, specifies an individual alpha
              value for each pair
    replace - (bool) if True, words are replaced instead of moved
              e.g.: if the pair is (dog, car), then v_car <- v_dog
    Returns a WordVectors object with the change applied.
    """
    wv_new = WordVectors(words=wv.words, vectors=np.copy(wv.vectors))
    for i, pair in enumerate(changes):
        t, w = pair  # t: target word, w: word to be modified
        w_i = wv.word_id[w]
        # Support a scalar alpha or per-pair alpha values
        a = alpha[i] if np.ndim(alpha) > 0 else alpha
        if not replace:
            # Alpha controls the rate of change; move w toward t
            wv_new.vectors[w_i] = wv_new[w] + a * wv[t]
        else:
            # Replace: overwrite v_w with v_t
            wv_new.vectors[w_i] = wv[t]
    return wv_new
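# Illustrative usage sketch for inject_change_batch (not part of the original
# module). The vocabulary, dimensionality, and word pair below are
# hypothetical placeholders.
def _example_inject_change_batch():
    words = ["dog", "car", "house"]
    vectors = np.random.rand(len(words), 100)
    wv = WordVectors(words=words, vectors=vectors)
    # Move "car" toward "dog" at rate alpha=0.5 (additive update)
    wv_moved = inject_change_batch(wv, [("dog", "car")], alpha=0.5,
                                   replace=False)
    # Replace v_car with v_dog outright
    wv_replaced = inject_change_batch(wv, [("dog", "car")], alpha=0.5,
                                      replace=True)
    return wv_moved, wv_replaced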
def align(wv1,
          wv2,
          anchor_indices=None,
          anchor_words=None,
          anchor_top=None,
          anchor_bot=None,
          anchor_random=None,
          exclude={},
          method="procrustes"):
    """
    Implements OP alignment for a given set of landmarks.
    If no landmark is given, performs global alignment.
    Arguments:
        wv1 - WordVectors object to align to wv2
        wv2 - Target WordVectors. Will align wv1 to it.
        anchor_indices - (optional) uses word indices as landmarks
        anchor_words - (optional) uses words as landmarks
        anchor_top - (optional) uses the first n words as landmarks
        anchor_bot - (optional) uses the last n words as landmarks
        anchor_random - (optional) uses n randomly sampled words as landmarks
        exclude - set of words to exclude from alignment
        method - Alignment objective. Currently only supports orthogonal
                 procrustes.
    """
    if anchor_top is not None:
        v1 = [wv1.vectors[i] for i in range(anchor_top)
              if wv1.words[i] not in exclude]
        v2 = [wv2.vectors[i] for i in range(anchor_top)
              if wv2.words[i] not in exclude]
    elif anchor_bot is not None:
        # Use -1 - i so that i = 0 maps to the last word (plain -i would
        # yield index 0 again)
        v1 = [wv1.vectors[-1 - i] for i in range(anchor_bot)
              if wv1.words[-1 - i] not in exclude]
        v2 = [wv2.vectors[-1 - i] for i in range(anchor_bot)
              if wv2.words[-1 - i] not in exclude]
    elif anchor_random is not None:
        anchors = np.random.choice(range(len(wv1.vectors)), anchor_random)
        v1 = [wv1.vectors[i] for i in anchors if wv1.words[i] not in exclude]
        v2 = [wv2.vectors[i] for i in anchors if wv2.words[i] not in exclude]
    elif anchor_indices is not None:
        v1 = [wv1.vectors[i] for i in anchor_indices
              if wv1.words[i] not in exclude]
        v2 = [wv2.vectors[i] for i in anchor_indices
              if wv2.words[i] not in exclude]
    elif anchor_words is not None:
        v1 = [wv1[w] for w in anchor_words if w not in exclude]
        v2 = [wv2[w] for w in anchor_words if w not in exclude]
    else:  # just use all words
        v1 = [wv1[w] for w in wv1.words if w not in exclude]
        v2 = [wv2[w] for w in wv2.words if w not in exclude]
    v1 = np.array(v1)
    v2 = np.array(v2)
    if method == "procrustes":  # align with OP
        Q, _ = orthogonal_procrustes(v1, v2)
        wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))
        return wv1_, wv2, Q
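# Minimal sketch of the two alignment modes of align() (illustrative; assumes
# wv1 and wv2 are intersected WordVectors sharing the same vocabulary).
def _example_align(wv1, wv2):
    # Global alignment over the full shared vocabulary
    wv1_global, _, Q = align(wv1, wv2)
    # Orthogonal procrustes yields an orthogonal Q, preserving norms/angles
    assert np.allclose(np.dot(Q.T, Q), np.eye(Q.shape[0]), atol=1e-4)
    # Anchored alignment: fit Q only on selected landmark words
    wv1_anchored, _, _ = align(wv1, wv2, anchor_words=wv1.words[:100])
    return wv1_anchored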
def main():
    """
    Runs main experiments using self-supervised alignment.
    """
    # wv_source = "wordvectors/latin/corpus1/0.vec"
    # wv_target = "wordvectors/latin/corpus2/0.vec"
    # wv_source = "wordvectors/source/theguardianuk.vec"
    # wv_target = "wordvectors/source/thenewyorktimes_1.vec"
    wv_source = "wordvectors/semeval/latin-corpus1.vec"
    wv_target = "wordvectors/semeval/latin-corpus2.vec"
    # wv_source = "wordvectors/usuk/bnc.vec"
    # wv_target = "wordvectors/usuk/coca_mag.vec"
    # wv_source = "wordvectors/artificial/NYT-0.vec"
    # wv_target = "wordvectors/artificial/NYT-500_random.vec"

    plt.style.use("seaborn")

    # Read WordVectors
    normalized = False
    wv1 = WordVectors(input_file=wv_source, normalized=normalized)
    wv2 = WordVectors(input_file=wv_target, normalized=normalized)
    wv1, wv2 = intersection(wv1, wv2)

    landmarks, non_landmarks, Q = s4(wv1,
                                     wv2,
                                     cls_model="nn",
                                     n_targets=100,
                                     n_negatives=100,
                                     rate=1,
                                     t=0.5,
                                     iters=100,
                                     verbose=1,
                                     plot=1)
    wv1, wv2, Q = align(wv1, wv2, anchor_words=landmarks)

    # Compare cosine distances of landmarks vs. non-landmarks
    d_l = [cosine(wv1[w], wv2[w]) for w in landmarks]
    d_n = [cosine(wv1[w], wv2[w]) for w in non_landmarks]
    sns.distplot(d_l, color="blue", label="landmarks")
    sns.distplot(d_n, color="red", label="non-landmarks")
    plt.legend()
    plt.show()
def s4(wv1,
       wv2,
       verbose=0,
       plot=0,
       cls_model="nn",
       iters=100,
       n_targets=10,
       n_negatives=10,
       fast=True,
       rate=0,
       t=0.5,
       t_overlap=1,
       landmarks=None,
       update_landmarks=True,
       return_model=False,
       debug=False):
    """
    Performs self-supervised learning of semantic change.
    Generates negative samples by sampling from landmarks.
    Generates positive samples via simulation of semantic change on random
    non-landmark words.
    Trains a classifier, fine-tuning it across multiple iterations.
    If update_landmarks is True, it learns landmarks at each step.
    In this case, the returned values are landmarks, non_landmarks, Q
    (transform matrix).
    Otherwise, landmarks are fixed from a starting set and the returned value
    is the learned classifier - landmarks must be passed.
    Arguments:
        wv1, wv2 - input WordVectors - required to be intersected before call
        verbose  - 1: display log, 0: quiet
        plot     - 1: plot functions in the end, 0: do not plot
        cls_model - classification model to use
                    {"nn", "svm_auto", "svm_features"}
        iters    - max no. of iterations
        n_targets - number of positive samples to generate
        n_negatives - number of negative samples
        fast     - use fast semantic change simulation
        rate     - rate of semantic change injection
        t        - classification threshold (0.5)
        t_overlap - overlap threshold (stop criterion)
        landmarks - list of words to use as landmarks (classification only)
        update_landmarks - if True, learns landmarks. Otherwise, learns the
                           classification model.
        return_model - if True, also returns the trained classifier when
                       learning landmarks
        debug    - toggles debugging mode on/off. Provides reports on several
                   metrics. Slower.
    Returns:
        if update_landmarks is True:
            landmarks - list of landmark words
            non_landmarks - list of non-landmark words
            Q - transformation matrix for procrustes alignment
        if update_landmarks is False:
            model - binary classifier
    """
    # Define verbose prints
    if verbose == 1:
        def verbose_print(*s, end="\n"):
            print(*s, end=end)
    elif verbose == 0:
        def verbose_print(*s, end="\n"):
            return None

    wv2_original = WordVectors(words=wv2.words, vectors=wv2.vectors.copy())

    avg_window = 0  # number of iterations to use in the running average

    # Begin alignment
    if update_landmarks:
        # Check if landmarks are initialized
        if landmarks is None:
            wv1, wv2, Q = align(wv1, wv2)  # start from global alignment
            landmark_dists = [
                euclidean(u, v) for u, v in zip(wv1.vectors, wv2.vectors)
            ]
            landmark_args = np.argsort(landmark_dists)
            # Initialize landmarks as the 50% of words closest after alignment
            landmarks = [
                wv1.words[i]
                for i in landmark_args[:int(len(wv1.words) * 0.5)]
            ]
            # landmarks = np.random.choice(wv1.words, int(len(wv1)*0.5))
        landmark_set = set(landmarks)
        non_landmarks = np.array(
            [w for w in wv1.words if w not in landmark_set])
    else:
        landmark_set = set(landmarks)
        non_landmarks = [w for w in wv1.words if w not in landmark_set]

    wv1, wv2, Q = align(wv1, wv2, anchor_words=landmarks)

    if cls_model == "nn":
        model = build_keras_model(wv1.dimension * 2)
    elif cls_model == "svm_auto" or cls_model == "svm_features":
        model = build_sklearn_model()  # get SVC

    landmark_hist = list()  # store no. of landmarks per iteration
    loss_hist = list()  # store self-supervision loss history
    alignment_loss_hist = list()  # store landmark alignment loss
    alignment_out_hist = list()  # store alignment loss outside of landmarks
    alignment_all_hist = list()
    cumulative_out_hist = list()
    cumulative_alignment_hist = list()  # store cumulative alignment loss
    overlap_hist = list()  # store landmark overlap history
    cumulative_overlap_hist = list()  # mean overlap history
    cumulative_loss = 0

    # History of cosines
    cos_loss_in_hist = list()
    cos_loss_out_hist = list()
    cumulative_cos_in = list()
    cumulative_cos_out = list()

    prev_landmarks = set(landmarks)
    for iter in range(iters):
        replace = dict()  # replacement dictionary
        pos_samples = list()
        pos_vectors = dict()

        # Randomly sample words to inject change to
        # If no word is flagged as a non-landmark, sample from all words
        # In practice, this should never occur when selecting landmarks,
        # but only for classification when aligning on all words
        if len(non_landmarks) > 0:
            targets = np.random.choice(non_landmarks, n_targets)
            # Make targets deterministic
            # targets = non_landmarks
        else:
            targets = np.random.choice(wv1.words, n_targets)

        for target in targets:
            # Simulate semantic change in target word
            v = inject_change_single(wv2_original, target, wv1.words,
                                     wv1[target], rate)
            pos_vectors[target] = v
            pos_samples.append(target)

        # Convert to numpy array
        pos_samples = np.array(pos_samples)
        # Get negative samples from landmarks
        neg_samples = negative_samples(landmarks, n_negatives, p=None)
        neg_vectors = {w: wv2_original[w] for w in neg_samples}

        # Create dictionary of supervision samples (positive and negative)
        # Mapping word -> vector
        sup_vectors = {**neg_vectors, **pos_vectors}

        # Prepare training data
        words_train = np.concatenate((pos_samples, neg_samples))
        # Assign labels to positive and negative samples
        y_train = [1] * len(pos_samples) + [0] * len(neg_samples)

        # Stack columns to shuffle data and labels together
        train = np.column_stack((words_train, y_train))
        # Shuffle batch
        np.random.shuffle(train)
        # Detach data and labels
        words_train = train[:, 0]
        y_train = train[:, -1].astype(int)

        x_train = np.array(
            [np.append(wv1[w], sup_vectors[w]) for w in words_train])

        # Append history
        landmark_hist.append(len(landmarks))
        v1_land = np.array([wv1[w] for w in landmarks])
        v2_land = np.array([wv2_original[w] for w in landmarks])
        v1_out = np.array([wv1[w] for w in non_landmarks])
        v2_out = np.array([wv2_original[w] for w in non_landmarks])

        # Alignment loss (MSE) over landmarks
        alignment_loss = np.linalg.norm(v1_land - v2_land)**2 / len(v1_land)
        alignment_loss_hist.append(alignment_loss)
        cumulative_alignment_hist.append(
            np.mean(alignment_loss_hist[-avg_window:]))

        # Alignment loss outside of landmarks
        alignment_out_loss = np.linalg.norm(v1_out - v2_out)**2 / len(v1_out)
        alignment_out_hist.append(alignment_out_loss)
        cumulative_out_hist.append(np.mean(alignment_out_hist[-avg_window:]))

        # Alignment loss over all words
        alignment_all_loss = np.linalg.norm(
            wv1.vectors - wv2_original.vectors)**2 / len(wv1.words)
        alignment_all_hist.append(alignment_all_loss)

        if debug:
            # Cosine loss
            cos_in = np.mean([cosine(u, v) for u, v in zip(v1_land, v2_land)])
            cos_out = np.mean([cosine(u, v) for u, v in zip(v1_out, v2_out)])
            cos_loss_in_hist.append(cos_in)
            cos_loss_out_hist.append(cos_out)
            cumulative_cos_in.append(np.mean(cos_loss_in_hist))
            cumulative_cos_out.append(np.mean(cos_loss_out_hist))

        # Begin training of neural network
        if cls_model == "nn":
            history = model.train_on_batch(x_train, y_train,
                                           reset_metrics=False)
            # history = model.fit(x_train, y_train, epochs=5, verbose=0)
            # history = [history.history["loss"][0]]
        elif cls_model == "svm_auto":
            model.fit(x_train, y_train)
            pred_train = model.predict_proba(x_train)
            history = [log_loss(y_train, pred_train)]
        elif cls_model == "svm_features":
            x_train_ = get_features(x_train)  # retrieve manual features
            model.fit(x_train_, y_train)
            pred_train = model.predict_proba(x_train_)
            y_hat_t = (pred_train[:, 0] > 0.5)
            acc_t = accuracy_score(y_train, y_hat_t)
            history = [log_loss(y_train, pred_train), acc_t]

        loss_hist.append(history[0])

        # Apply model on original data to select landmarks
        x_real = np.array([
            np.append(u, v)
            for u, v in zip(wv1.vectors, wv2_original.vectors)
        ])
        if cls_model == "nn":
            predict_real = model.predict(x_real)
        elif cls_model == "svm_auto":
            predict_real = model.predict_proba(x_real)
            predict_real = predict_real[:, 1]
        elif cls_model == "svm_features":
            x_real_ = get_features(x_real)
            predict_real = model.predict_proba(x_real_)
            predict_real = predict_real[:, 1]
        y_predict = (predict_real > t)

        if update_landmarks:
            landmarks = [
                wv1.words[i] for i in range(len(wv1.words))
                if predict_real[i] < t
            ]
            non_landmarks = [
                wv1.words[i] for i in range(len(wv1.words))
                if predict_real[i] > t
            ]

        # Update landmark overlap using the Jaccard index
        isect_ab = set.intersection(prev_landmarks, set(landmarks))
        union_ab = set.union(prev_landmarks, set(landmarks))
        j_index = len(isect_ab) / len(union_ab)
        overlap_hist.append(j_index)
        cumulative_overlap_hist.append(
            np.mean(overlap_hist[-avg_window:]))  # store mean

        prev_landmarks = set(landmarks)
        verbose_print(
            "> %3d | L %4d | l(in): %.2f | l(out): %.2f | loss: %.2f | overlap %.2f | acc: %.2f"
            % (iter, len(landmarks), cumulative_alignment_hist[-1],
               cumulative_out_hist[-1], history[0],
               cumulative_overlap_hist[-1], history[1]),
            end="\r")

        wv1, wv2_original, Q = align(wv1, wv2_original,
                                     anchor_words=landmarks)

        # Stop if the mean landmark overlap exceeds the threshold
        if np.mean(overlap_hist) > t_overlap:
            break

    # Print new line
    verbose_print()

    if plot == 1:
        iter += 1  # add one to iter for plotting
        plt.plot(range(iter), landmark_hist, label="landmarks")
        plt.hlines(len(wv1.words), 0, iter, colors="red")
        plt.ylabel("No. of landmarks")
        plt.xlabel("Iteration")
        plt.show()

        plt.plot(range(iter), loss_hist, c="red", label="loss")
        plt.ylabel("Loss (binary cross-entropy)")
        plt.xlabel("Iteration")
        plt.legend()
        plt.show()

        plt.plot(range(iter), cumulative_alignment_hist,
                 label="in (landmarks)")
        plt.plot(range(iter), cumulative_out_hist, label="out")
        plt.plot(range(iter), alignment_all_hist, label="all")
        plt.ylabel("Alignment loss (MSE)")
        plt.xlabel("Iteration")
        plt.legend()
        plt.show()

        if debug:
            plt.plot(range(iter), cumulative_cos_in, label="cos in")
            plt.plot(range(iter), cumulative_cos_out, label="cos out")
            plt.legend()
            plt.show()

        plt.plot(range(iter), cumulative_overlap_hist, label="overlap")
        plt.ylabel("Jaccard Index", fontsize=16)
        plt.xlabel("Iteration", fontsize=16)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        # plt.legend()
        plt.tight_layout()
        plt.savefig("overlap.pdf", format="pdf")
        # plt.show()

    if update_landmarks:
        if not return_model:
            return landmarks, non_landmarks, Q
        else:
            return landmarks, non_landmarks, Q, model
    else:
        return model
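# Illustrative end-to-end sketch of the two modes of s4() (not part of the
# original module); assumes wv1 and wv2 were intersected beforehand.
def _example_s4(wv1, wv2):
    # Landmark-learning mode: returns landmarks, non-landmarks, and the
    # orthogonal alignment matrix Q
    landmarks, non_landmarks, Q = s4(wv1, wv2, cls_model="nn",
                                     n_targets=100, n_negatives=100,
                                     rate=1, iters=100, verbose=0)
    wv1_aligned, wv2_, _ = align(wv1, wv2, anchor_words=landmarks)
    # Classification mode: landmarks stay fixed and the trained binary
    # classifier is returned instead
    model = s4(wv1_aligned, wv2_, landmarks=landmarks,
               update_landmarks=False, iters=100, verbose=0)
    return landmarks, model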
def threshold_crossvalidation(wv1,
                              wv2,
                              iters=100,
                              n_fold=1,
                              n_targets=100,
                              n_negatives=100,
                              fast=True,
                              rate=0.5,
                              t=0.5,
                              landmarks=None,
                              t_overlap=1,
                              debug=False):
    """
    Runs cross-validation over self-supervised samples, carrying out a model
    selection to determine the best cosine threshold to use in the final
    prediction.
    Arguments:
        wv1, wv2 - input WordVectors - required to be intersected and ALIGNED
                   before call
        iters    - max no. of iterations
        n_fold   - n-fold cross-validation (1 - leave one out, 10 - 10-fold
                   cv, etc.)
        n_targets - number of positive samples to generate
        n_negatives - number of negative samples
        fast     - use fast semantic change simulation
        rate     - rate of semantic change injection
        t        - classification threshold (0.5)
        t_overlap - overlap threshold (stop criterion)
        landmarks - list of words to use as landmarks (classification only)
        debug    - toggles debugging mode on/off. Provides reports on several
                   metrics. Slower.
    Returns:
        best_t - selected cosine threshold
    """
    wv2_original = WordVectors(words=wv2.words, vectors=wv2.vectors.copy())

    landmark_set = set(landmarks)
    non_landmarks = [w for w in wv1.words if w not in landmark_set]

    for iter in range(iters):
        replace = dict()  # replacement dictionary
        pos_samples = list()
        pos_vectors = dict()

        # Randomly sample words to inject change to
        # If no word is flagged as a non-landmark, sample from all words
        # In practice, this should never occur when selecting landmarks,
        # but only for classification when aligning on all words
        if len(non_landmarks) > 0:
            targets = np.random.choice(non_landmarks, n_targets)
            # Make targets deterministic
            # targets = non_landmarks
        else:
            targets = np.random.choice(wv1.words, n_targets)

        for target in targets:
            # Simulate semantic change in target word
            v = inject_change_single(wv2_original, target, wv1.words,
                                     wv1[target], rate)
            pos_vectors[target] = v
            pos_samples.append(target)

        # Convert to numpy array
        pos_samples = np.array(pos_samples)
        # Get negative samples from landmarks
        neg_samples = negative_samples(landmarks, n_negatives, p=None)
        neg_vectors = {w: wv2_original[w] for w in neg_samples}

        # Create dictionary of supervision samples (positive and negative)
        # Mapping word -> vector
        sup_vectors = {**neg_vectors, **pos_vectors}

        # Prepare training data
        words_train = np.concatenate((pos_samples, neg_samples))
        # Assign labels to positive and negative samples
        y_train = [1] * len(pos_samples) + [0] * len(neg_samples)

        # Stack columns to shuffle data and labels together
        train = np.column_stack((words_train, y_train))
        # Shuffle batch
        np.random.shuffle(train)
        # Detach data and labels
        words_train = train[:, 0]
        y_train = train[:, -1].astype(int)

        # Calculate cosine distance of training samples
        x_train = np.array(
            [cosine(wv1[w], sup_vectors[w]) for w in words_train])

        # Grid-search candidate thresholds via n-fold cross-validation
        # t_pool = [0.2, 0.7]
        t_pool = np.arange(0.2, 1, 0.1)
        best_acc = 0
        best_t = 0
        for t_ in t_pool:
            acc = 0
            for i in range(0, len(x_train), n_fold):
                x_cv = x_train[i:i + n_fold]
                y_true = y_train[i:i + n_fold]
                y_hat = x_cv > t_
                acc += sum(y_hat == y_true) / len(x_cv)
            acc = acc / (len(x_train) // n_fold)
            if acc > best_acc:
                best_acc = acc
                best_t = t_
                print("- New best t", t_, acc)

    return best_t
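# Illustrative sketch of picking a cosine threshold with
# threshold_crossvalidation and applying it (assumes wv1 and wv2 are already
# intersected and aligned, with landmarks from a prior s4() run).
def _example_threshold_crossvalidation(wv1, wv2, landmarks):
    best_t = threshold_crossvalidation(wv1, wv2, iters=1, n_fold=1,
                                       n_targets=100, n_negatives=100,
                                       rate=0.5, landmarks=landmarks)
    # Words whose cross-corpus cosine distance exceeds best_t are predicted
    # as semantically changed
    changed = [w for w in wv1.words if cosine(wv1[w], wv2[w]) > best_t]
    return best_t, changed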
def main():
    """
    The following experiments are available:
    - Find the most stable words in each ArXiv category
      (cs, math, cond-mat, physics)
    - Find the most unstable (changed) words in each category
    - Find stable/unstable words across categories
    - Use different alignment strategies
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("cat1", type=str, help="Name of first arXiv category")
    parser.add_argument("cat2", type=str,
                        help="Name of second arXiv category")

    args = parser.parse_args()

    cat1 = args.cat1
    cat2 = args.cat2
    cat1_name = cat1.split("/")[-1]
    cat2_name = cat2.split("/")[-1]
    # cat1_name = cat1.split("_")[2].rstrip(".vec")
    # cat2_name = cat2.split("_")[2].rstrip(".vec")

    path_out = "results/arxiv/"

    wva = WordVectors(input_file=cat1)
    wvb = WordVectors(input_file=cat2)

    wva, wvb = intersection(wva, wvb)
    wva, wvb, Q = align(wva, wvb)
    words = wva.words
    print("-- Common vocab", len(words))

    # Each column of this matrix stores a set of results for a method;
    # columns 3 and 4 (top, bot) are left as zeros here
    out_grid = np.zeros((len(words), 5))

    d = distribution_of_change(wva, wvb)
    print("====== GLOBAL")
    print("=> landmarks", len(wva.words))
    print_table(d, wva.words)
    out_grid[:, 0] = d  # first column: global alignment

    print("====== Noise Aware")
    Q, alpha, landmarks, noisy = noise_aware(wva.vectors, wvb.vectors)
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    print("=> landmarks", len(landmarks))
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 1] = d  # second column: noise-aware alignment

    print("===== SELF")
    landmarks, nonl, Q = s4(wva, wvb, iters=100, verbose=1)
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 2] = d  # third column: self-supervised (s4) alignment

    # WRITE-OUT
    with open(os.path.join(path_out, "%s-%s.csv" % (cat1_name, cat2_name)),
              "w") as fout:
        fout.write("word,global,noise-aware,self,top,bot\n")
        for i, w in enumerate(words):
            fout.write("%s,%.3f,%.3f,%.3f,%.3f,%.3f\n" %
                       (w, out_grid[i][0], out_grid[i][1], out_grid[i][2],
                        out_grid[i][3], out_grid[i][4]))
def main():
    """
    Performs tests on SemEval2020-Task 1 data on Unsupervised Lexical
    Semantic Change Detection.
    This experiment is designed to evaluate the performance of different
    landmark selection approaches, showing how the classification performance
    is affected by the landmark choices.
    """
    np.random.seed(1)

    align_methods = [
        "s4", "noise-aware", "top-10", "bot-10", "global", "top-5", "bot-5"
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument("--languages",
                        nargs="+",
                        help="Languages to use",
                        default=["english", "german", "latin", "swedish"])
    parser.add_argument("--cls",
                        choices=["cosine", "s4", "cosine-auto"],
                        default="cosine",
                        help="Classifier to use")

    args = parser.parse_args()

    languages = args.languages
    classifier = args.cls

    align_params = \
        {
            "english": {"n_targets": 100, "n_negatives": 50,
                        "rate": 1, "iters": 100},
            "german": {"n_targets": 100, "n_negatives": 200,
                       "rate": 1, "iters": 100},
            "latin": {"n_targets": 10, "n_negatives": 4,
                      "rate": 0.5, "iters": 100},
            "swedish": {"n_targets": 100, "n_negatives": 200,
                        "rate": 1, "iters": 100}
        }

    cls_params = \
        {
            "english": {"n_targets": 100, "n_negatives": 50,
                        "rate": 1, "iters": 500},
            "german": {"n_targets": 50, "n_negatives": 200},
            "latin": {"n_targets": 50, "n_negatives": 10},
            "swedish": {"n_targets": 120, "n_negatives": 120}
        }

    auto_params = \
        {
            "english": {"rate": 1.5, "n_fold": 1,
                        "n_targets": 50, "n_negatives": 100},
            "german": {"rate": 1, "n_fold": 1,
                       "n_targets": 200, "n_negatives": 100},
            "latin": {"rate": 1, "n_targets": 100, "n_negatives": 15},
            "swedish": {"rate": 1, "n_targets": 100, "n_negatives": 200}
        }

    normalized = False

    accuracies = defaultdict(dict)
    true_positives = defaultdict(dict)
    false_negatives = defaultdict(dict)
    correct_ans = defaultdict(dict)
    cm = defaultdict(dict)

    for lang in languages:
        # print("---")
        # print(lang)
        t = 0.5
        thresholds = np.arange(0.1, 1, 0.1)

        path_task1 = "data/semeval/truth/%s.txt" % lang
        path_task2 = "data/semeval/truth/%s.txt" % lang

        with open(path_task1) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            targets, true_class = zip(*data)
            y_true = np.array(true_class, dtype=int)
        with open(path_task2) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            _, true_ranking = zip(*data)
            true_ranking = np.array(true_ranking, dtype=float)

        corpus1_path = "wordvectors/semeval/%s-corpus1.vec" % lang
        corpus2_path = "wordvectors/semeval/%s-corpus2.vec" % lang

        wv1 = WordVectors(input_file=corpus1_path, normalized=normalized)
        wv2 = WordVectors(input_file=corpus2_path, normalized=normalized)

        c_method = defaultdict(list)
        wv1, wv2 = intersection(wv1, wv2)
        # print("Size of common vocab.", len(wv1))
        prediction = dict()  # store per-word prediction

        for align_method in align_methods:
            accuracies[align_method][lang] = list()
            true_positives[align_method][lang] = list()
            false_negatives[align_method][lang] = list()
            cm[align_method][lang] = np.zeros((2, 2))
            if align_method == "global":
                landmarks = wv1.words
            elif align_method == "noise-aware":
                Q, alpha, landmarks, non_landmarks = noise_aware(
                    wv1.vectors, wv2.vectors)
                landmarks = [wv1.words[i] for i in landmarks]
            elif align_method == "s4":
                landmarks, non_landmarks, Q = s4(
                    wv1,
                    wv2,
                    cls_model="nn",
                    verbose=0,
                    **align_params[lang],
                )
            # Words are assumed sorted by frequency (most frequent first),
            # so top-k takes the head of the list and bot-k the tail
            elif align_method == "top-10":
                landmarks = wv1.words[:int(len(wv1.words) * 0.1)]
            elif align_method == "top-5":
                landmarks = wv1.words[:int(len(wv1.words) * 0.05)]
            elif align_method == "top-50":
                landmarks = wv1.words[:int(len(wv1.words) * 0.50)]
            elif align_method == "bot-10":
                landmarks = wv1.words[-int(len(wv1.words) * 0.1):]
            elif align_method == "bot-5":
                landmarks = wv1.words[-int(len(wv1.words) * 0.05):]
            elif align_method == "bot-50":
                landmarks = wv1.words[-int(len(wv1.words) * 0.50):]

            wv1_, wv2_, Q = align(wv1, wv2, anchor_words=landmarks)

            # Cosine-based classifier
            if classifier == "cosine":
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[i.lower()]] for i in targets])
                p = x.reshape(-1, 1)
                r = vote(p)
                y_pred = r

                best_acc = 0
                for t in thresholds:
                    y_bin = (y_pred > t)
                    correct = (y_bin == y_true)
                    accuracy = accuracy_score(y_true, y_bin)
                    if accuracy > best_acc:
                        prediction[align_method] = correct
                        best_acc = accuracy
                    tn, fp, fn, tp = confusion_matrix(y_true, y_bin).ravel()
                    cm[align_method][lang] += confusion_matrix(
                        y_true, y_bin, normalize="all")

                    accuracies[align_method][lang].append(round(accuracy, 2))
                    true_positives[align_method][lang].append(round(tp, 2))
                    false_negatives[align_method][lang].append(round(fn, 2))
            elif classifier == "cosine-auto":
                t_cos = threshold_crossvalidation(wv1_,
                                                  wv2_,
                                                  iters=1,
                                                  **auto_params[lang],
                                                  landmarks=landmarks)
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[i.lower()]] for i in targets])
                p = x.reshape(-1, 1)
                r = vote(p)
                y_pred = r

                y_bin = y_pred > t_cos
                correct = (y_bin == y_true)
                accuracy = accuracy_score(y_true, y_bin)
                accuracies[align_method][lang].append(round(accuracy, 2))
            elif classifier == "s4":
                model = s4(wv1_,
                           wv2_,
                           landmarks=landmarks,
                           verbose=0,
                           **cls_params[lang],
                           update_landmarks=False)
                # Concatenate vectors of target words for prediction
                x = np.array([
                    np.concatenate((wv1_[t.lower()], wv2_[t.lower()]))
                    for t in targets
                ])
                y_pred = model.predict(x)
                y_bin = y_pred > 0.5
                correct = (y_bin == y_true)
                accuracy = accuracy_score(y_true, y_bin)
                print(accuracy)
                accuracies[align_method][lang].append(round(accuracy, 2))

            c_method[align_method] = y_pred
            rho, pvalue = spearmanr(true_ranking, y_pred)
            # print(lang, align_method, "acc", accuracies[align_method][lang],
            #       "\nranking", round(rho, 2),
            #       "landmarks", len(landmarks))

    print("|Method|Language|Mean acc.|Max acc.|")
    print("|------|--------|---------|--------|")
    for method in accuracies:
        print("|", method, end="|")
        for lang in accuracies[method]:
            print(lang,
                  round(np.mean(accuracies[method][lang]), 2),
                  np.max(accuracies[method][lang]),
                  sep="|",
                  end="|\n")
    print()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment",
                        choices=[
                            'top-5', 'top-10', 'noise-aware', 'bot-5',
                            'bot-10', 'global', 's4'
                        ],
                        default="top",
                        help="Method to use in the alignment of UK to US")
    parser.add_argument("--rounds",
                        type=int,
                        default=1,
                        help="No. of rounds to run the classifications")
    args = parser.parse_args()

    path_us = "wordvectors/ukus/coca.vec"
    path_uk = "wordvectors/ukus/bnc.vec"
    path_dict = "data/ukus/dict_similar.txt"
    path_dict_dis = "data/ukus/dict_dissimilar.txt"

    normalized = False
    wv1 = WordVectors(input_file=path_uk, normalized=normalized)
    wv2 = WordVectors(input_file=path_us, normalized=normalized)
    wv_uk, wv_us = intersection(wv1, wv2)

    # Load dictionaries of words
    with open(path_dict) as fin:
        dico_sim = list(map(lambda s: s.strip().split(" ", 1),
                            fin.readlines()))
    with open(path_dict_dis) as fin:
        dico_dis = list(map(lambda s: (s.strip(), s.strip()),
                            fin.readlines()))

    # Filter out words not in the vocabulary of either the UK or US corpora
    dico_sim = [(a, b) for a, b in dico_sim
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico_dis = [(a, b) for a, b in dico_dis
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico = dico_sim + dico_dis

    # Create true labels for terms
    # 0 -> similar | 1 -> dissimilar
    y_true = [0] * len(dico_sim) + [1] * len(dico_dis)

    m = args.alignment
    # Align word vectors (using any alignment approach)
    if m == "noise-aware":
        Q, alpha, landmarks, noise = noise_aware(wv_uk.vectors,
                                                 wv_us.vectors)
        landmarks = [wv_uk.words[i] for i in landmarks]
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
    elif m == "global":
        landmarks = wv_us.words
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
        landmarks = landmarks[:len(landmarks) // 2]
    elif m == "s4":
        landmarks = wv_us.words
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
        landmarks, non_landmarks, Q = s4(
            wv_uk,
            wv_us,
            cls_model="nn",
            verbose=0,
            iters=100,
            n_targets=100,
            n_negatives=10,
            rate=0.25,
        )
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
    elif m == "top-10":
        landmarks = wv_us.words[:int(len(wv_us.words) * 0.1)]
    elif m == "top-5":
        landmarks = wv_us.words[:int(len(wv_us.words) * 0.05)]
    elif m == "bot-10":
        landmarks = wv_us.words[-int(len(wv_us.words) * 0.1):]
    elif m == 'bot-5':
        landmarks = wv_us.words[-int(len(wv_us.words) * 0.05):]
    a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)

    wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))

    test_pairs = dico
    # print("Landmarks", len(landmarks))

    # Train classifier
    self_scores = list()
    cos_scores = list()
    na_scores = list()
    iters = 100
    # Cosine thresholds to evaluate
    cos_thresholds = [0.3, 0.5, 0.7]
    # Run several rounds, if given
    for r in range(args.rounds):
        model = s4(a_,
                   b_,
                   iters=iters,
                   landmarks=landmarks,
                   verbose=0,
                   n_targets=1000,
                   n_negatives=1000,
                   rate=0.25,
                   cls_model="nn",
                   update_landmarks=False)
        acc = 0
        acc_cos = 0
        total = 0
        y_pred = list()
        y_pred_cos = list()
        try:
            x = np.array([
                np.concatenate((wv1_[p[0]], wv2[p[1]])) for p in test_pairs
            ])
            x_cos = np.array(
                [cosine(wv1_[p[0]], wv2[p[1]]) for p in test_pairs])

            # Predict with noise-aware
            # Generate pairs (u, v) and apply noise-aware
            # 0 if pair is clean, 1 if pair is noisy
            v_a = np.array([wv1_[p[0]] for p in test_pairs])
            v_b = np.array([wv2[p[1]] for p in test_pairs])
            Q, alpha, clean, noisy = noise_aware(v_a, v_b)
            y_pred_na = np.zeros((len(test_pairs)))
            for i in noisy:
                y_pred_na[i] = 1
        except KeyError as e:  # skip pair if a word is not in the model
            pass

        y_hat = model.predict(x)
        y_pred = (y_hat > 0.5)

        self_acc = accuracy_score(y_true, y_pred)
        self_prec = precision_score(y_true, y_pred)
        self_rec = recall_score(y_true, y_pred)
        self_f1 = f1_score(y_true, y_pred)
        self_scores.append([self_acc, self_prec, self_rec, self_f1])

        # Cosine metrics, one row per threshold per round
        cos_acc = cos_prec = cos_rec = cos_f1 = 0
        for t in cos_thresholds:
            y_pred_cos = (x_cos > t)
            cos_acc = round(accuracy_score(y_true, y_pred_cos), 2)
            cos_prec = round(precision_score(y_true, y_pred_cos), 2)
            cos_rec = round(recall_score(y_true, y_pred_cos), 2)
            cos_f1 = round(f1_score(y_true, y_pred_cos), 2)
            cos_scores.append([cos_acc, cos_prec, cos_rec, cos_f1])

        # Noise-Aware metrics
        na_acc = round(accuracy_score(y_true, y_pred_na), 2)
        na_prec = round(precision_score(y_true, y_pred_na), 2)
        na_rec = round(recall_score(y_true, y_pred_na), 2)
        na_f1 = round(f1_score(y_true, y_pred_na), 2)
        na_scores.append([na_acc, na_prec, na_rec, na_f1])

    self_scores = np.array(self_scores)
    cos_scores = np.array(cos_scores)
    na_scores = np.array(na_scores)

    # Print Markdown table
    # cos_scores holds len(cos_thresholds) rows per round, so step by the
    # number of thresholds to average each threshold across rounds
    for j, t in enumerate(cos_thresholds):
        print("|COS %.2f" % t, m, sep="|", end="|")
        for i in range(4):
            print("%.2f" %
                  round(cos_scores[j::len(cos_thresholds), i].mean(), 2),
                  end="|")
        print()

    print("|S4-D", m, end="|", sep="|")
    for i in range(4):
        print("%.2f +- %.2f" % (round(self_scores[:, i].mean(), 2),
                                round(self_scores[:, i].std(), 2)),
              end="|")
    print()
    print("|Noisy-Pairs", "-", *na_scores[0], sep="|", end="|\n")