Example #1
def inject_change_batch(wv, changes, alpha, replace=True):
    """
    Given a WordVectors object and a list of word pairs, perform fast injection
    of semantic change using the update rule from Word2Vec.
    Arguments:
        wv      - WordVectors (input)
        changes - list of n tuples (a, b) that drive the change such that b->a,
                  i.e.: simulates using b in the contexts of a
        alpha   - degree to which the change is injected
                  if scalar: apply the same alpha to every pair
                  if array-like: must have size n, giving an individual alpha
                                 for each pair
        replace - (bool) if True, words are replaced instead of moved
                  e.g.: if the pair is (dog, car), then v_car <- v_dog
    Returns a new WordVectors object with the change applied.
    """
    wv_new = WordVectors(words=wv.words, vectors=np.copy(wv.vectors))
    for i, (t, w) in enumerate(changes):
        w_i = wv.word_id[w]  # index of the word to be modified
        # Per-pair alpha when array-like, shared scalar otherwise
        a = alpha[i] if hasattr(alpha, "__len__") else alpha
        if not replace:
            # Move w towards t at rate alpha (Word2Vec-style update)
            wv_new.vectors[w_i] = wv_new[w] + a * wv[t]
        else:
            # Replace: w takes the vector of t
            wv_new.vectors[w_i] = wv[t]
    return wv_new
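A minimal usage sketch; the vector file is the one loaded in Example #3, and the (dog, car) pair and alpha value are illustrative only:

wv = WordVectors(input_file="wordvectors/semeval/latin-corpus1.vec")
# Nudge v_car towards v_dog at rate 0.3 rather than replacing it outright
wv_changed = inject_change_batch(wv, [("dog", "car")], alpha=0.3, replace=False)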
Example #2
def align(wv1,
          wv2,
          anchor_indices=None,
          anchor_words=None,
          anchor_top=None,
          anchor_bot=None,
          anchor_random=None,
          exclude=frozenset(),
          method="procrustes"):
    """
    Performs orthogonal Procrustes (OP) alignment for a given set of landmarks.
    If no landmarks are given, performs global alignment over all words.
    Arguments:
        wv1 - WordVectors object to align to wv2
        wv2 - target WordVectors; wv1 is aligned to it
        anchor_indices - (optional) word indices to use as landmarks
        anchor_words - (optional) words to use as landmarks
        anchor_top - (optional) uses the first anchor_top words as landmarks
        anchor_bot - (optional) uses the last anchor_bot words as landmarks
        anchor_random - (optional) uses anchor_random randomly sampled words as landmarks
        exclude - set of words to exclude from alignment
        method - alignment objective; currently only orthogonal Procrustes is supported
    """
    if anchor_top is not None:
        v1 = [
            wv1.vectors[i] for i in range(anchor_top)
            if wv1.words[i] not in exclude
        ]
        v2 = [
            wv2.vectors[i] for i in range(anchor_top)
            if wv2.words[i] not in exclude
        ]
    elif anchor_bot is not None:
        # Index from the end of the vocabulary, checking the same words
        # against the exclude set
        v1 = [
            wv1.vectors[-i] for i in range(1, anchor_bot + 1)
            if wv1.words[-i] not in exclude
        ]
        v2 = [
            wv2.vectors[-i] for i in range(1, anchor_bot + 1)
            if wv2.words[-i] not in exclude
        ]
    elif anchor_random is not None:
        anchors = np.random.choice(range(len(wv1.vectors)), anchor_random)
        v1 = [wv1.vectors[i] for i in anchors if wv1.words[i] not in exclude]
        v2 = [wv2.vectors[i] for i in anchors if wv2.words[i] not in exclude]
    elif anchor_indices is not None:
        v1 = [wv1.vectors[i] for i in anchor_indices if wv1.words[i] not in exclude]
        v2 = [wv2.vectors[i] for i in anchor_indices if wv2.words[i] not in exclude]
    elif anchor_words is not None:
        v1 = [wv1[w] for w in anchor_words if w not in exclude]
        v2 = [wv2[w] for w in anchor_words if w not in exclude]
    else:  # just use all words
        v1 = [wv1[w] for w in wv1.words if w not in exclude]
        v2 = [wv2[w] for w in wv2.words if w not in exclude]
    v1 = np.array(v1)
    v2 = np.array(v2)
    if method == "procrustes":  # align with OP
        Q, _ = orthogonal_procrustes(v1, v2)
    else:
        raise ValueError("Unknown alignment method: %s" % method)

    wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))

    return wv1_, wv2, Q
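A minimal usage sketch, assuming wv1 and wv2 are WordVectors over the same vocabulary (the anchor choice and exclude set are illustrative):

wv1, wv2 = intersection(wv1, wv2)  # same vocabulary, same ordering
wv1_aligned, wv2, Q = align(wv1, wv2, anchor_top=1000, exclude={"the", "of"})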
Example #3
def main():
    """
    Runs the main experiments using self-supervised alignment.
    """
    # wv_source = "wordvectors/latin/corpus1/0.vec"
    # wv_target = "wordvectors/latin/corpus2/0.vec"
    # wv_source = "wordvectors/source/theguardianuk.vec"
    # wv_target = "wordvectors/source/thenewyorktimes_1.vec"
    wv_source = "wordvectors/semeval/latin-corpus1.vec"
    wv_target = "wordvectors/semeval/latin-corpus2.vec"
    # wv_source = "wordvectors/usuk/bnc.vec"
    # wv_target = "wordvectors/usuk/coca_mag.vec"
    # wv_source = "wordvectors/artificial/NYT-0.vec"
    # wv_target = "wordvectors/artificial/NYT-500_random.vec"
    plt.style.use("seaborn")

    # Read WordVectors
    normalized = False
    wv1 = WordVectors(input_file=wv_source, normalized=normalized)
    wv2 = WordVectors(input_file=wv_target, normalized=normalized)

    wv1, wv2 = intersection(wv1, wv2)

    landmarks, non_landmarks, Q = s4(wv1,
                                     wv2,
                                     cls_model="nn",
                                     n_targets=100,
                                     n_negatives=100,
                                     rate=1,
                                     t=0.5,
                                     iters=100,
                                     verbose=1,
                                     plot=1)
    wv1, wv2, Q = align(wv1, wv2, anchor_words=landmarks)
    d_l = [cosine(wv1[w], wv2[w]) for w in landmarks]
    d_n = [cosine(wv1[w], wv2[w]) for w in non_landmarks]
    sns.distplot(d_l, color="blue", label="landmarks")
    sns.distplot(d_n, color="red", label="non-landmarks")
    plt.legend()
    plt.show()
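The intersection helper used throughout these examples is not listed; a minimal sketch of its assumed behavior (restrict both objects to the common vocabulary, in a consistent order, so that index i refers to the same word in both):

def intersection(wv1, wv2):
    # Hypothetical sketch - the real helper may differ in details
    common = [w for w in wv1.words if w in wv2.word_id]
    v1 = np.array([wv1[w] for w in common])
    v2 = np.array([wv2[w] for w in common])
    return (WordVectors(words=common, vectors=v1),
            WordVectors(words=common, vectors=v2))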
Example #4
def s4(wv1,
       wv2,
       verbose=0,
       plot=0,
       cls_model="nn",
       iters=100,
       n_targets=10,
       n_negatives=10,
       fast=True,
       rate=0,
       t=0.5,
       t_overlap=1,
       landmarks=None,
       update_landmarks=True,
       return_model=False,
       debug=False):
    """
    Performs self-supervised learning of semantic change.
    Generates negative samples by sampling from landmarks.
    Generates positive samples via simulation of semantic change on random non-landmark words.
    Trains a classifier and fine-tunes it across multiple iterations.
    If update_landmarks is True, landmarks are learned at each step. In this case,
    the returned values are landmarks, non_landmarks, Q (transformation matrix).
    Otherwise, landmarks are fixed to the given starting set and the returned value
    is the learned classifier - landmarks must be passed in this case.
    Arguments:
        wv1, wv2    - input WordVectors - required to be intersected before call
        verbose     - 1: display log, 0: quiet
        plot        - 1: plot functions in the end 0: do not plot
        cls_model   - classification model to use {"nn", "svm_auto", "svm_features"}
        iters       - max no. of iterations
        n_targets   - number of positive samples to generate
        n_negatives - number of negative samples
        fast        - use fast semantic change simulation
        rate        - rate of semantic change injection
        t           - classification threshold (default 0.5)
        t_overlap   - landmark overlap threshold (stop criterion)
        landmarks   - list of words to use as landmarks (classification only)
        update_landmarks - if True, learns landmarks. Otherwise, learns a classification model.
        return_model - if True and update_landmarks is True, also returns the trained classifier
        debug       - toggles debugging mode on/off. Provides reports on several metrics. Slower.
    Returns:
        if update_landmarks is True:
            landmarks - list of landmark words
            non_landmarks - list of non_landmark words
            Q           - transformation matrix for procrustes alignment
        if update_landmarks is False:
            model       - binary classifier
    """

    # Define verbose print: real print if verbose == 1, no-op otherwise
    if verbose == 1:

        def verbose_print(*s, end="\n"):
            print(*s, end=end)
    else:

        def verbose_print(*s, end="\n"):
            return None

    wv2_original = WordVectors(words=wv2.words, vectors=wv2.vectors.copy())

    avg_window = 0  # iterations used in the running averages (0: entire history, since hist[-0:] is the whole list)

    # Begin alignment
    if update_landmarks:
        # Check if landmarks is initialized
        if landmarks is None:
            wv1, wv2, Q = align(wv1, wv2)  # start from global alignment
            landmark_dists = [
                euclidean(u, v) for u, v in zip(wv1.vectors, wv2.vectors)
            ]
            landmark_args = np.argsort(landmark_dists)
            landmarks = [
                wv1.words[i] for i in landmark_args[:int(len(wv1.words) * 0.5)]
            ]
            # landmarks = np.random.choice(wv1.words, int(len(wv1)*0.5))
        landmark_set = set(landmarks)
        non_landmarks = np.array(
            [w for w in wv1.words if w not in landmark_set])
    else:
        landmark_set = set(landmarks)
        non_landmarks = [w for w in wv1.words if w not in landmark_set]

    wv1, wv2, Q = align(wv1, wv2, anchor_words=landmarks)

    if cls_model == "nn":
        model = build_keras_model(wv1.dimension * 2)
    elif cls_model == "svm_auto" or cls_model == "svm_features":
        model = build_sklearn_model()  # get SVC

    landmark_hist = list()  # store no. of landmark history
    loss_hist = list()  # store self-supervision loss history
    alignment_loss_hist = list()  # store landmark alignment loss
    alignment_out_hist = list()  # store alignment loss outside of lm
    alignment_all_hist = list()

    cumulative_out_hist = list()
    cumulative_alignment_hist = list()  # store cumulative loss alignment
    overlap_hist = list()  # store landmark overlap history
    cumulative_overlap_hist = list()  # mean overlap history
    cumulative_loss = 0

    # History of cosines
    cos_loss_in_hist = list()
    cos_loss_out_hist = list()
    cumulative_cos_in = list()
    cumulative_cos_out = list()

    prev_landmarks = set(landmarks)
    for iter in range(iters):

        pos_samples = list()
        pos_vectors = dict()

        # Randomly sample words to inject change into
        # If no word is flagged as a non-landmark, sample from all words
        # In practice this only happens for classification when aligning
        # on all words, never when selecting landmarks
        if len(non_landmarks) > 0:
            targets = np.random.choice(non_landmarks, n_targets)
            # Make targets deterministic
            #targets = non_landmarks
        else:
            targets = np.random.choice(wv1.words, n_targets)

        for target in targets:

            # Simulate semantic change in target word
            v = inject_change_single(wv2_original, target, wv1.words,
                                     wv1[target], rate)

            pos_vectors[target] = v

            pos_samples.append(target)
        # Convert to numpy array
        pos_samples = np.array(pos_samples)
        # Get negative samples from landmarks
        neg_samples = negative_samples(landmarks, n_negatives, p=None)
        neg_vectors = {w: wv2_original[w] for w in neg_samples}
        # Create dictionary of supervision samples (positive and negative)
        # Mapping word -> vector
        sup_vectors = {**neg_vectors, **pos_vectors}

        # Prepare training data
        words_train = np.concatenate((pos_samples, neg_samples))
        # assign labels to positive and negative samples
        y_train = [1] * len(pos_samples) + [0] * len(neg_samples)

        # Stack columns to shuffle data and labels together
        train = np.column_stack((words_train, y_train))
        # Shuffle batch
        np.random.shuffle(train)
        # Detach data and labels
        words_train = train[:, 0]
        y_train = train[:, -1].astype(int)

        x_train = np.array(
            [np.append(wv1[w], sup_vectors[w]) for w in words_train])

        # Append history
        landmark_hist.append(len(landmarks))
        v1_land = np.array([wv1[w] for w in landmarks])
        v2_land = np.array([wv2_original[w] for w in landmarks])
        v1_out = np.array([wv1[w] for w in non_landmarks])
        v2_out = np.array([wv2_original[w] for w in non_landmarks])

        alignment_loss = np.linalg.norm(v1_land - v2_land)**2 / len(v1_land)
        alignment_loss_hist.append(alignment_loss)
        cumulative_alignment_hist.append(
            np.mean(alignment_loss_hist[-avg_window:]))

        # out loss
        alignment_out_loss = np.linalg.norm(v1_out - v2_out)**2 / len(v1_out)
        alignment_out_hist.append(alignment_out_loss)
        cumulative_out_hist.append(np.mean(alignment_out_hist[-avg_window:]))

        # all loss
        alignment_all_loss = np.linalg.norm(wv1.vectors -
                                            wv2_original.vectors)**2 / len(
                                                wv1.words)
        alignment_all_hist.append(alignment_all_loss)

        if debug:
            # cosine loss
            cos_in = np.mean([cosine(u, v) for u, v in zip(v1_land, v2_land)])
            cos_out = np.mean([cosine(u, v) for u, v in zip(v1_out, v2_out)])
            cos_loss_in_hist.append(cos_in)
            cos_loss_out_hist.append(cos_out)
            cumulative_cos_in.append(np.mean(cos_loss_in_hist))
            cumulative_cos_out.append(np.mean(cos_loss_out_hist))

        # Begin training of neural network
        if cls_model == "nn":
            history = model.train_on_batch(x_train,
                                           y_train,
                                           reset_metrics=False)
            # history = model.fit(x_train, y_train, epochs=5, verbose=0)
            # history = [history.history["loss"][0]]
        elif cls_model == "svm_auto":
            model.fit(x_train, y_train)
            pred_train = model.predict_proba(x_train)
            history = [log_loss(y_train, pred_train)]
        elif cls_model == "svm_features":
            x_train_ = get_features(x_train)  # retrieve manual features
            model.fit(x_train_, y_train)
            pred_train = model.predict_proba(x_train_)
            y_hat_t = (pred_train[:, 1] > 0.5)  # column 1 is P(y=1)
            acc_t = accuracy_score(y_train, y_hat_t)
            history = [log_loss(y_train, pred_train), acc_t]

        loss_hist.append(history[0])

        # Apply model on original data to select landmarks
        x_real = np.array([
            np.append(u, v) for u, v in zip(wv1.vectors, wv2_original.vectors)
        ])
        if cls_model == "nn":
            predict_real = model.predict(x_real)
        elif cls_model == "svm_auto":
            predict_real = model.predict_proba(x_real)
            predict_real = predict_real[:, 1]
        elif cls_model == "svm_features":
            x_real_ = get_features(x_real)
            predict_real = model.predict_proba(x_real_)
            predict_real = predict_real[:, 1]


        if update_landmarks:
            landmarks = [
                wv1.words[i] for i in range(len(wv1.words))
                if predict_real[i] < t
            ]
            non_landmarks = [
                wv1.words[i] for i in range(len(wv1.words))
                if predict_real[i] > t
            ]

        # Update landmark overlap using Jaccard Index
        isect_ab = set.intersection(prev_landmarks, set(landmarks))
        union_ab = set.union(prev_landmarks, set(landmarks))
        j_index = len(isect_ab) / len(union_ab)
        overlap_hist.append(j_index)

        cumulative_overlap_hist.append(np.mean(
            overlap_hist[-avg_window:]))  # store mean

        prev_landmarks = set(landmarks)

        verbose_print(
            "> %3d | L %4d | l(in): %.2f | l(out): %.2f | loss: %.2f | overlap %.2f | acc: %.2f"
            % (iter, len(landmarks), cumulative_alignment_hist[-1],
               cumulative_out_hist[-1], history[0],
               cumulative_overlap_hist[-1],
               history[1] if len(history) > 1 else float("nan")),
            end="\r")

        wv1, wv2_original, Q = align(wv1, wv2_original, anchor_words=landmarks)

        # Stop early once the mean landmark overlap exceeds the threshold
        if np.mean(overlap_hist) > t_overlap:
            break

    # Print new line
    verbose_print()

    if plot == 1:
        iter += 1  # add one to iter for plotting
        plt.plot(range(iter), landmark_hist, label="landmarks")
        plt.hlines(len(wv1.words), 0, iter, colors="red")
        plt.ylabel("No. of landmarks")
        plt.xlabel("Iteration")
        plt.show()
        plt.plot(range(iter), loss_hist, c="red", label="loss")
        plt.ylabel("Loss (binary crossentropy)")
        plt.xlabel("Iteration")
        plt.legend()
        plt.show()
        plt.plot(range(iter),
                 cumulative_alignment_hist,
                 label="in (landmarks)")
        plt.plot(range(iter), cumulative_out_hist, label="out")
        plt.plot(range(iter), alignment_all_hist, label="all")
        plt.ylabel("Alignment loss (MSE)")
        plt.xlabel("Iteration")
        plt.legend()
        plt.show()

        if debug:
            plt.plot(range(iter), cumulative_cos_in, label="cos in")
            plt.plot(range(iter), cumulative_cos_out, label="cos out")
            plt.legend()
            plt.show()

        plt.plot(range(iter), cumulative_overlap_hist, label="overlap")

        plt.ylabel("Jaccard Index", fontsize=16)
        plt.xlabel("Iteration", fontsize=16)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        # plt.legend()
        plt.tight_layout()
        plt.savefig("overlap.pdf", format="pdf")
        #plt.show()

    if update_landmarks:
        if not return_model:
            return landmarks, non_landmarks, Q
        else:
            return landmarks, non_landmarks, Q, model
    else:
        return model
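The two modes of s4 in a minimal sketch, assuming intersected WordVectors wv1 and wv2; the rate values follow Examples #3 and #8, and the query word "word" is illustrative:

# Mode 1: learn landmarks (as in Example #3)
landmarks, non_landmarks, Q = s4(wv1, wv2, cls_model="nn", rate=1, iters=100)

# Mode 2: learn a change classifier on fixed landmarks (as in Example #8)
model = s4(wv1, wv2, landmarks=landmarks, update_landmarks=False,
           rate=0.25, iters=100)
score = model.predict(np.append(wv1["word"], wv2["word"]).reshape(1, -1))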
Example #5
def threshold_crossvalidation(wv1,
                              wv2,
                              iters=100,
                              n_fold=1,
                              n_targets=100,
                              n_negatives=100,
                              fast=True,
                              rate=0.5,
                              t=0.5,
                              landmarks=None,
                              t_overlap=1,
                              debug=False):
    """
    Runs crossvalidation over self-supervised samples, carrying out a model
    selection to determine the best cosine threshold to use in the final
    prediction.

    Arguments:
        wv1, wv2    - input WordVectors - required to be intersected and ALIGNED before call
        iters       - max no. of iterations
        n_fold      - size of each crossvalidation fold (1 - leave-one-out)
        n_targets   - number of positive samples to generate
        n_negatives - number of negative samples
        fast        - use fast semantic change simulation
        rate        - rate of semantic change injection
        t           - classification threshold (default 0.5)
        t_overlap   - landmark overlap threshold (stop criterion)
        landmarks   - list of words to use as landmarks (classification only)
        debug       - toggles debugging mode on/off. Provides reports on several metrics. Slower.
    Returns:
        best_t - the selected cosine distance threshold
    """

    wv2_original = WordVectors(words=wv2.words, vectors=wv2.vectors.copy())
    landmark_set = set(landmarks)
    non_landmarks = [w for w in wv1.words if w not in landmark_set]

    # Track the best threshold across all outer iterations
    best_acc = 0
    best_t = 0
    for iter in range(iters):

        pos_samples = list()
        pos_vectors = dict()

        # Randomly sample words to inject change into
        # If no word is flagged as a non-landmark, sample from all words
        # In practice this only happens for classification when aligning
        # on all words, never when selecting landmarks
        if len(non_landmarks) > 0:
            targets = np.random.choice(non_landmarks, n_targets)
            # Make targets deterministic
            #targets = non_landmarks
        else:
            targets = np.random.choice(wv1.words, n_targets)

        for target in targets:

            # Simulate semantic change in target word
            v = inject_change_single(wv2_original, target, wv1.words,
                                     wv1[target], rate)

            pos_vectors[target] = v

            pos_samples.append(target)
        # Convert to numpy array
        pos_samples = np.array(pos_samples)
        # Get negative samples from landmarks
        neg_samples = negative_samples(landmarks, n_negatives, p=None)
        neg_vectors = {w: wv2_original[w] for w in neg_samples}
        # Create dictionary of supervision samples (positive and negative)
        # Mapping word -> vector
        sup_vectors = {**neg_vectors, **pos_vectors}

        # Prepare training data
        words_train = np.concatenate((pos_samples, neg_samples))
        # assign labels to positive and negative samples
        y_train = [1] * len(pos_samples) + [0] * len(neg_samples)

        # Stack columns to shuffle data and labels together
        train = np.column_stack((words_train, y_train))
        # Shuffle batch
        np.random.shuffle(train)
        # Detach data and labels
        words_train = train[:, 0]
        y_train = train[:, -1].astype(int)

        # Calculate cosine distance of training samples
        x_train = np.array(
            [cosine(wv1[w], sup_vectors[w]) for w in words_train])

        # t_pool = [0.2, 0.7]
        t_pool = np.arange(0.2, 1, 0.1)

        for t_ in t_pool:
            acc = 0
            for i in range(0, len(x_train), n_fold):
                x_cv = x_train[i:i + n_fold]
                y_true = y_train[i:i + n_fold]
                y_hat = x_cv > t_
                acc += sum(y_hat == y_true) / len(x_cv)
            acc = acc / (len(x_train) // n_fold)
            if acc > best_acc:
                best_acc = acc
                best_t = t_
                print("- New best t", t_, acc)

    return best_t
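A minimal usage sketch, following the cosine-auto branch of Example #7 (parameter values are illustrative):

wv1_, wv2_, Q = align(wv1, wv2, anchor_words=landmarks)
t_cos = threshold_crossvalidation(wv1_, wv2_, iters=1, rate=1,
                                  n_targets=100, n_negatives=100,
                                  landmarks=landmarks)
changed = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words]) > t_cos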
Example #6
def main():
    """
    The following experiments are available:
        - Find most stable words in each arXiv category (cs, math, cond-mat, physics)
        - Find most unstable (changed) words in each category
        - Find stable/unstable words across categories
        - Use different alignment strategies
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("cat1", type=str, help="Name of first arXiv category")
    parser.add_argument("cat2", type=str, help="Name of second arXiv category")

    args = parser.parse_args()

    cat1 = args.cat1
    cat2 = args.cat2

    cat1_name = cat1.split("/")[-1]
    cat2_name = cat2.split("/")[-1]

    # cat1_name = cat1.split("_")[2].rstrip(".vec")
    # cat2_name = cat2.split("_")[2].rstrip(".vec")

    path_out = "results/arxiv/"

    wva = WordVectors(input_file=cat1)
    wvb = WordVectors(input_file=cat2)
    wva, wvb = intersection(wva, wvb)
    wva, wvb, Q = align(wva, wvb)
    words = wva.words

    print("-- Common vocab", len(words))
    # each column of this matrix will store a set of results for a method
    out_grid = np.zeros((len(words), 5))

    d = distribution_of_change(wva, wvb)
    print("====== GLOBAL")
    print("=> landmarks", len(wva.words))
    print_table(d, wva.words)
    out_grid[:, 0] = d  # add first column

    print("====== Noise Aware")

    Q, alpha, landmarks, noisy = noise_aware(wva.vectors, wvb.vectors)
    landmarks = [wva.words[i] for i in landmarks]  # convert indices to words (as in Examples #7 and #8)
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    print("=> landmarks", len(landmarks))
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 1] = d  # add new column

    print("===== SELF")
    landmarks, nonl, Q = s4(wva, wvb, iters=100, verbose=1)
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 2] = d  # third column; columns 3 (top) and 4 (bot) stay zero in this snippet

    # WRITE-OUT
    with open(os.path.join(path_out, "%s-%s.csv" % (cat1_name, cat2_name)),
              "w") as fout:
        fout.write("word,global,noise-aware,self,top,bot\n")
        for i, w in enumerate(words):
            fout.write("%s,%.3f,%.3f,%.3f,%.3f,%.3f\n" %
                       (w, out_grid[i][0], out_grid[i][1], out_grid[i][2],
                        out_grid[i][3], out_grid[i][4]))
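distribution_of_change is used above but not listed; a minimal sketch of its assumed behavior (one cosine distance per word of the common, aligned vocabulary):

def distribution_of_change(wv1, wv2):
    # Hypothetical sketch - the real helper may normalize or weight these values
    return np.array([cosine(wv1[w], wv2[w]) for w in wv1.words])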
Example #7
def main():
    """
    Performs tests on SemEval-2020 Task 1 data on Unsupervised Lexical Semantic Change Detection.
    This experiment evaluates the performance of different landmark selection approaches,
    showing how classification performance is affected by the choice of landmarks.
    """
    np.random.seed(1)

    align_methods = [
        "s4", "noise-aware", "top-10", "bot-10", "global", "top-5", "bot-5"
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument("--languages",
                        nargs="+",
                        help="Languages to use",
                        default=["english", "german", "latin", "swedish"])
    parser.add_argument("--cls",
                        choices=["cosine", "s4", "cosine-auto"],
                        default="cosine",
                        help="Classifier to use")

    args = parser.parse_args()
    languages = args.languages
    classifier = args.cls

    align_params = \
    {
        "english" : {
            "n_targets": 100,
            "n_negatives": 50,
            "rate": 1,
            "iters": 100
        },
        "german" : {
            "n_targets": 100,
            "n_negatives": 200,
            "rate": 1,
            "iters": 100
        },
        "latin" : {
            "n_targets": 10,
            "n_negatives": 4,
            "rate": 0.5,
            "iters": 100
        },
        "swedish" : {
            "n_targets": 100,
            "n_negatives": 200,
            "rate": 1,
            "iters": 100
        }
    }

    cls_params = \
    {
        "english": {
            "n_targets": 100,
            "n_negatives": 50,
            "rate": 1,
            "iters": 500
        },
        "german":{
            "n_targets": 50,
            "n_negatives": 200
        },
        "latin":
        {
            "n_targets": 50,
            "n_negatives": 10
        },
        "swedish":
        {
            "n_targets": 120,
            "n_negatives": 120
        }
    }

    auto_params = \
    {
        "english":
            {
            "rate": 1.5,
            "n_fold": 1,
            "n_targets": 50,
            "n_negatives": 100
            },
        "german":
        {
            "rate":1,
            "n_fold": 1,
            "n_targets": 200,
            "n_negatives": 100
        },
        "latin":
        {
            "rate": 1,
            "n_targets": 100,
            "n_negatives": 15
        },
        "swedish":
        {
            "rate": 1,
            "n_targets": 100,
            "n_negatives": 200
        }
    }

    normalized = False
    accuracies = defaultdict(dict)
    true_positives = defaultdict(dict)
    false_negatives = defaultdict(dict)
    correct_ans = defaultdict(dict)
    cm = defaultdict(dict)
    for lang in languages:
        # print("---")
        # print(lang)
        t = 0.5
        thresholds = np.arange(0.1, 1, 0.1)
        path_task1 = "data/semeval/truth/%s.txt" % lang
        path_task2 = "data/semeval/truth/%s.txt" % lang

        with open(path_task1) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            targets, true_class = zip(*data)
            y_true = np.array(true_class, dtype=int)
        with open(path_task2) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            _, true_ranking = zip(*data)
            true_ranking = np.array(true_ranking, dtype=float)

        corpus1_path = "wordvectors/semeval/%s-corpus1.vec" % lang
        corpus2_path = "wordvectors/semeval/%s-corpus2.vec" % lang
        wv1 = WordVectors(input_file=corpus1_path, normalized=normalized)
        wv2 = WordVectors(input_file=corpus2_path, normalized=normalized)

        c_method = defaultdict(list)
        wv1, wv2 = intersection(wv1, wv2)
        # print("Size of common vocab.", len(wv1))
        prediction = dict()  # store per-word prediction
        for align_method in align_methods:
            accuracies[align_method][lang] = list()
            true_positives[align_method][lang] = list()
            false_negatives[align_method][lang] = list()
            cm[align_method][lang] = np.zeros((2, 2))

            if align_method == "global":
                landmarks = wv1.words
            elif align_method == "noise-aware":
                Q, alpha, landmarks, non_landmarks = noise_aware(
                    wv1.vectors, wv2.vectors)
                landmarks = [wv1.words[i] for i in landmarks]
            elif align_method == "s4":
                landmarks, non_landmarks, Q = s4(
                    wv1,
                    wv2,
                    cls_model="nn",
                    verbose=0,
                    **align_params[lang],
                )
            elif align_method == "top-10":
                landmarks = wv1.words[int(len(wv1.words) * 0.1):]
            elif align_method == "top-5":
                landmarks = wv1.words[int(len(wv1.words) * 0.05):]
            elif align_method == "top-50":
                landmarks = wv1.words[int(len(wv1.words) * 0.50):]
            elif align_method == "bot-10":
                landmarks = wv1.words[-int(len(wv1.words) * 0.1):]
            elif align_method == "bot-5":
                landmarks = wv1.words[-int(len(wv1.words) * 0.05):]
            elif align_method == "bot-50":
                landmarks = wv1.words[-int(len(wv1.words) * 0.50):]

            wv1_, wv2_, Q = align(wv1, wv2, anchor_words=landmarks)

            # Cosine-based classifier
            if classifier == "cosine":
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[i.lower()]] for i in targets])
                p = x.reshape(-1, 1)
                r = vote(p)
                y_pred = r

                best_acc = 0
                for t in thresholds:
                    y_bin = (y_pred > t)
                    correct = (y_bin == y_true)

                    accuracy = accuracy_score(y_true, y_bin)
                    if accuracy > best_acc:
                        prediction[align_method] = correct
                        best_acc = accuracy
                    tn, fp, fn, tp = confusion_matrix(y_true, y_bin).ravel()
                    cm[align_method][lang] += confusion_matrix(y_true,
                                                               y_bin,
                                                               normalize="all")
                    accuracies[align_method][lang].append(round(accuracy, 2))
                    true_positives[align_method][lang].append(round(tp, 2))
                    false_negatives[align_method][lang].append(round(fn, 2))
            elif classifier == "cosine-auto":
                t_cos = threshold_crossvalidation(wv1_,
                                                  wv2_,
                                                  iters=1,
                                                  **auto_params[lang],
                                                  landmarks=landmarks)
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[i.lower()]] for i in targets])
                p = x.reshape(-1, 1)
                r = vote(p)
                y_pred = r
                y_bin = y_pred > t_cos
                correct = (y_bin == y_true)

                accuracy = accuracy_score(y_true, y_bin)

                accuracies[align_method][lang].append(round(accuracy, 2))

            elif classifier == "s4":
                model = s4(wv1_,
                           wv2_,
                           landmarks=landmarks,
                           verbose=0,
                           **cls_params[lang],
                           update_landmarks=False)
                # Concatenate vectors of target words for prediction
                x = np.array([
                    np.concatenate((wv1_[t.lower()], wv2_[t.lower()]))
                    for t in targets
                ])
                y_pred = model.predict(x)
                y_bin = y_pred > 0.5
                correct = (y_bin == y_true)

                accuracy = accuracy_score(y_true, y_bin)
                print(accuracy)
                accuracies[align_method][lang].append(round(accuracy, 2))

            c_method[align_method] = y_pred
            rho, pvalue = spearmanr(true_ranking, y_pred)

            # print(lang, align_method, "acc", accuracies[align_method][lang],
            #                                 "\nranking", round(rho, 2),
            #                                 "landmarks", len(landmarks))

    print("|Method|Language|Mean acc.|Max acc.|")
    print("|------|--------|---------|--------|")
    for method in accuracies:
        print("|", method, end="|")
        for lang in accuracies[method]:
            print(lang,
                  round(np.mean(accuracies[method][lang]), 2),
                  np.max(accuracies[method][lang]),
                  sep="|",
                  end="|\n")
    print()
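get_feature_cdf and vote are used above but not listed; a minimal sketch of their assumed behavior (map each cosine distance to its empirical CDF value, then average predictor columns):

from scipy.stats import rankdata

def get_feature_cdf(x):
    # Hypothetical sketch: empirical CDF value of each entry, in (0, 1]
    return rankdata(x) / len(x)

def vote(p):
    # Hypothetical sketch: average across the columns of an (n, k) matrix
    return np.mean(p, axis=1)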
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment",
                        choices=[
                            'top-5', 'top-10', 'noise-aware', 'bot-5',
                            'bot-10', 'global', 's4'
                        ],
                        default="top",
                        help="Method to use in the alignment of UK to US")
    parser.add_argument("--rounds",
                        type=int,
                        default=1,
                        help="No. of rounds to run the classifications")

    args = parser.parse_args()

    path_us = "wordvectors/ukus/coca.vec"
    path_uk = "wordvectors/ukus/bnc.vec"
    path_dict = "data/ukus/dict_similar.txt"
    path_dict_dis = "data/ukus/dict_dissimilar.txt"

    normalized = False

    wv1 = WordVectors(input_file=path_uk, normalized=normalized)
    wv2 = WordVectors(input_file=path_us, normalized=normalized)

    wv_uk, wv_us = intersection(wv1, wv2)

    # Load dictionaries of words
    with open(path_dict) as fin:
        dico_sim = list(map(lambda s: s.strip().split(" ", 1),
                            fin.readlines()))

    with open(path_dict_dis) as fin:
        dico_dis = list(map(lambda s: (s.strip(), s.strip()), fin.readlines()))

    # Filter out words not in the vocabulary of either the UK or US corpus
    dico_sim = [(a, b) for a, b in dico_sim
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico_dis = [(a, b) for a, b in dico_dis
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico = dico_sim + dico_dis
    # Create true labels for terms
    # 0 -> similar | 1 -> dissimilar
    y_true = [0] * len(dico_sim) + [1] * len(dico_dis)

    m = args.alignment
    # Align wordvectors (using any alignment approach)
    if m == "noise-aware":
        Q, alpha, landmarks, noise = noise_aware(wv_uk.vectors, wv_us.vectors)
        landmarks = [wv_uk.words[i] for i in landmarks]
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
    elif m == "global":
        landmarks = wv_us.words
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
        landmarks = landmarks[:len(landmarks) // 2]
    elif m == "s4":
        landmarks = wv_us.words
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
        landmarks, non_landmarks, Q = s4(
            wv_uk,
            wv_us,
            cls_model="nn",
            verbose=0,
            iters=100,
            n_targets=100,
            n_negatives=10,
            rate=0.25,
        )

        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
    elif m == "top-10":
        landmarks = wv_us.words[:int(len(wv_us.words) * 0.1)]
    elif m == "top-5":
        landmarks = wv_us.words[:int(len(wv_us.words) * 0.05)]
    elif m == "bot-10":
        landmarks = wv_us.words[-int(len(wv_us.words) * 0.1):]
    elif m == 'bot-5':
        landmarks = wv_us.words[-int(len(wv_us.words) * 0.05):]

    a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)

    wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))

    test_pairs = dico
    # print("Landmarks", len(landmarks))
    # Train classifier
    self_scores = list()
    cos_scores = list()
    na_scores = list()
    iters = 100

    # Interval to vary cosine thresholds
    cos_thresholds = [0.3, 0.5, 0.7]

    # Run several rounds, if given
    for r in range(args.rounds):
        model = s4(a_,
                   b_,
                   iters=iters,
                   landmarks=landmarks,
                   verbose=0,
                   n_targets=1000,
                   n_negatives=1000,
                   rate=0.25,
                   cls_model="nn",
                   update_landmarks=False)

        try:
            x = np.array(
                [np.concatenate((wv1_[p[0]], wv2[p[1]])) for p in test_pairs])
            x_cos = np.array(
                [cosine(wv1_[p[0]], wv2[p[1]]) for p in test_pairs])

            # Predict with noise-aware
            # Generate pairs (u, v) and apply noise-aware
            # 0 if pair is clean, 1 if pair is noisy

            v_a = np.array([wv1_[p[0]] for p in test_pairs])
            v_b = np.array([wv2[p[1]] for p in test_pairs])
            Q, alpha, clean, noisy = noise_aware(v_a, v_b)

            y_pred_na = np.zeros((len(test_pairs)))
            for i in noisy:
                y_pred_na[i] = 1

        except KeyError:  # skip this round if a word is missing from a model
            continue
        y_hat = model.predict(x)
        y_pred = (y_hat > 0.5)

        self_acc = accuracy_score(y_true, y_pred)
        self_prec = precision_score(y_true, y_pred)
        self_rec = recall_score(y_true, y_pred)
        self_f1 = f1_score(y_true, y_pred)
        self_scores.append([self_acc, self_prec, self_rec, self_f1])

        # Cosine metrics, computed at each of the cosine thresholds
        for t in cos_thresholds:
            y_pred_cos = (x_cos > t)
            cos_acc = round(accuracy_score(y_true, y_pred_cos), 2)
            cos_prec = round(precision_score(y_true, y_pred_cos), 2)
            cos_rec = round(recall_score(y_true, y_pred_cos), 2)
            cos_f1 = round(f1_score(y_true, y_pred_cos), 2)

            cos_scores.append([cos_acc, cos_prec, cos_rec, cos_f1])

        # Noise-Aware metrics
        na_acc = round(accuracy_score(y_true, y_pred_na), 2)
        na_prec = round(precision_score(y_true, y_pred_na), 2)
        na_rec = round(recall_score(y_true, y_pred_na), 2)
        na_f1 = round(f1_score(y_true, y_pred_na), 2)
        na_scores.append([na_acc, na_prec, na_rec, na_f1])

    self_scores = np.array(self_scores)
    cos_scores = np.array(cos_scores)
    na_scores = np.array(na_scores)

    # Print Markdown table
    # Rows of cos_scores cycle through cos_thresholds, so take every
    # len(cos_thresholds)-th row to average threshold j over all rounds
    for j, t in enumerate(cos_thresholds):
        print("|COS %.2f" % t, m, sep="|", end="|")
        for i in range(4):
            print("%.2f" % cos_scores[j::len(cos_thresholds), i].mean(),
                  end="|",
                  sep=" ")
        print("|")
    print("|")
    print("|S4-D", m, end="|", sep="|")
    for i in range(4):
        print("%.2f +- %.2f" % (round(self_scores[:, i].mean(),
                                      2), round(self_scores[:, i].std(), 2)),
              end="|",
              sep=" ")
    print("|")
    print("|Noisy-Pairs", "-", *na_scores[0], sep="|", end="|\n")