Example #1
def add_cos(va, vb, vc, vd):
    """
    Uses the following formula for scoring:
    cos(vb - va + vc, vd)
    """
    x = normalize(vb - va + vc)
    return cosine(x, vd)
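The examples on this page call module-level helpers such as normalize, cosine, get_embedding and the pkg_dir path, which are defined elsewhere in the package and not shown here. A minimal sketch of the two vector helpers, assuming cosine is plain cosine similarity and normalize rescales to unit L2 norm:

import numpy

def normalize(v):
    # Rescale v to unit L2 norm; leave the zero vector unchanged.
    norm = numpy.linalg.norm(v)
    return v if norm == 0 else v / norm

def cosine(x, y):
    # Cosine similarity between two vectors.
    return numpy.dot(x, y) / (numpy.linalg.norm(x) * numpy.linalg.norm(y))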
Example #2
def eval_diff_vect(WR):
    """
    Uses the DiffVec dataset for performing 1-NN relation classification.
    We will use PairDiff to create a vector for a word-pair and then measure the similarity
    between the target pair and the remaining word-pairs in the dataset.
    If the 1-NN has the same relation label as the target pair, then we consider it to
    be a correct match. We compute accuracy = correct_matches / total_instances.
    """
    analogy_file = open(os.path.join(pkg_dir, "../benchmarks/diff-vec"))
    relation = {}
    pairs = []
    label = ""
    while 1:
        line = analogy_file.readline()
        if len(line) == 0:
            break
        if line.startswith(':'):  # This is a label
            label = line.split(':')[1].strip()
        else:
            p = line.strip().split()
            (a, b) = p
            pairs.append((a, b))
            relation[(a, b)] = label
    analogy_file.close()
    n = len(pairs)
    M = numpy.zeros((n, WR.dim), dtype=numpy.float64)
    for (i, (a, b)) in enumerate(pairs):
        M[i, :] = normalize(get_embedding(a, WR) - get_embedding(b, WR))
    S = numpy.dot(M, M.T)
    preds = (-S).argsort()[:, 1]
    corrects = sum(
        [relation[pairs[i]] == relation[pairs[preds[i]]] for i in range(n)])
    accuracy = float(100 * corrects) / float(n)
    print "DiffVec Accuracy =", accuracy
    return accuracy
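The line preds = (-S).argsort()[:, 1] relies on the rows of M being unit vectors: each pair is then most similar to itself, so column 0 of the sorted indices is the pair itself and column 1 is its nearest other pair. A small self-contained check of that behaviour:

import numpy

M = numpy.array([[1.0, 0.0],
                 [0.8, 0.6],
                 [0.0, 1.0]])   # unit-normalised rows
S = numpy.dot(M, M.T)           # pairwise cosine similarities
print((-S).argsort()[:, 0])     # [0 1 2]: each row is most similar to itself
print((-S).argsort()[:, 1])     # [1 0 1]: nearest neighbour excluding self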
Example #3
def evaluate_embeddings(embed_fname, dim, res_fname):
    """
    Evaluate an embedding on the semantic similarity and word analogy
    benchmarks, and write the results to res_fname.
    """
    res = {}
    WR = WordReps()
    # We will load vectors only for the words in the benchmarks.
    words = set()
    with open(os.path.join(pkg_dir, "../benchmarks/all_words.txt")) as F:
        for line in F:
            words.add(line.strip())
    WR.read_model(embed_fname, dim, words)

    # semantic similarity benchmarks.
    benchmarks = ["ws", "rg", "mc", "rw", "scws", "men", "simlex"]
    for bench in benchmarks:
        (corr, sig) = get_correlation(
            os.path.join(pkg_dir, "../benchmarks/%s_pairs.txt" % bench),
            WR.vects, "spearman")
        print "%s = %f" % (bench, corr)
        res[bench] = corr

    cands = list(words)
    M = numpy.zeros((len(cands), WR.dim), dtype=numpy.float64)
    for (i, w) in enumerate(cands):
        M[i, :] = normalize(get_embedding(w, WR))

    # word analogy benchmarks.
    res["Google_res"] = eval_Google_Analogies(WR, M, cands)
    res["MSR_res"] = eval_MSR_Analogies(WR, M, cands)
    res["SemEval_res"] = eval_SemEval(WR, "CosAdd")
    res["DiffVec_acc"] = eval_diff_vect(WR)
    #res["SAT_res"] = eval_SAT_Analogies(WR, scoring_method)

    res_file = open(res_fname, 'w')
    res_file.write(
        "#RG, MC, WS, RW, SCWS, MEN, SimLex, sem, syn, total, SemEval, MSR, DiffVec\n"
    )
    res_file.write("%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n" %
                   (res["rg"], res["mc"], res["ws"], res["rw"], res["scws"],
                    res["men"], res["simlex"], res["Google_res"]["semantic"],
                    res["Google_res"]["syntactic"], res["Google_res"]["total"],
                    res["SemEval_res"]["acc"], res["MSR_res"]["accuracy"],
                    res["DiffVec_acc"]))
    res_file.close()
    return res
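A typical call, assuming a whitespace-separated text embedding file; the file names and dimensionality below are placeholders rather than values taken from this repository:

res = evaluate_embeddings("embeddings.txt", 300, "results.csv")
print("Google total =", res["Google_res"]["total"])
print("DiffVec accuracy =", res["DiffVec_acc"])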
Example #4
def eval_MSR_Analogies(WR, M, cands):
    """
    Evaluate the accuracy of the learnt vectors on the analogy task using the MSR dataset.
    We consider the set of fourth words in the test dataset as the
    candidate space for the correct answer.
    """
    analogy_file = open(
        os.path.join(pkg_dir, "../benchmarks/msr-analogies.txt"))
    questions = []
    total_questions = 0
    corrects = 0
    while 1:
        line = analogy_file.readline()
        if len(line) == 0:
            break
        p = line.strip().split()
        total_questions += 1
        questions.append((p[0], p[1], p[2], p[3]))
    analogy_file.close()

    print "== MSR Analogy Dataset =="
    print "Total no. of questions =", len(questions)
    print "Total no. of candidates =", len(cands)

    # predict the fourth word for each question.
    count = 1
    for (a, b, c, d) in questions:
        print "%d / %d" % (count, len(questions)), "\r",
        count += 1
        # set of candidates for the current question are the fourth
        # words in all questions, except the three words for the current question.
        scores = []
        va = get_embedding(a, WR)
        vb = get_embedding(b, WR)
        vc = get_embedding(c, WR)
        x = normalize(vb - va + vc)
        s = numpy.dot(M, x)
        nns = [cands[i] for i in (-s).argsort()[:4]]
        nns = [y for y in nns if y not in (a, b, c)]
        if nns[0] == d:
            corrects += 1
    accuracy = float(corrects) / float(len(questions))
    print "MSR accuracy =", accuracy
    return {"accuracy": accuracy}
Example #5
def PairDiff(va, vb, vc, vd):
    """
    Uses the following formula for scoring:
    cos(vd - vc, vb - va)
    """
    return cosine(normalize(vd - vc), normalize(vb - va))
Example #6
def subt_cos(va, vb, vc, vd):
    """
    Uses the following formula for scoring:
    cos(va - vc, vb - vd)
    """
    return cosine(normalize(va - vc), normalize(vb - vd))
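add_cos (Example #1), PairDiff (Example #5) and subt_cos (Example #6) are alternative scoring functions for the same analogy quadruple a:b :: c:d, and a correct analogy should score highly under all three. A usage sketch, assuming a WordReps model WR has already been loaded and that all four words are in its vocabulary:

va, vb, vc, vd = [get_embedding(w, WR) for w in ("man", "king", "woman", "queen")]
print("CosAdd   =", add_cos(va, vb, vc, vd))
print("PairDiff =", PairDiff(va, vb, vc, vd))
print("SubtCos  =", subt_cos(va, vb, vc, vd))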
Example #7
def eval_Google_Analogies(WR, M, cands):
    """
    Evaluate the accuracy of the learnt vectors on the analogy task. 
    We consider the set of fourth words in the test dataset as the
    candidate space for the correct answer.
    """
    analogy_file = open(
        os.path.join(pkg_dir, "../benchmarks/google-analogies.txt"))
    questions = collections.OrderedDict()
    total_questions = {}
    corrects = {}
    while 1:
        line = analogy_file.readline()
        if len(line) == 0:
            break
        if line.startswith(':'):  # This is a label
            label = line.split(':')[1].strip()
            questions[label] = []
            total_questions[label] = 0
            corrects[label] = 0
        else:
            p = line.strip().split()
            total_questions[label] += 1
            questions[label].append((p[0], p[1], p[2], p[3]))
    analogy_file.close()

    print "== Google Analogy Dataset =="
    print "Total no. of question types =", len(questions)
    print "Total no. of candidates =", len(cands)

    # predict the fourth word for each question.
    # Total number of questions attempted (every question in this snippet).
    valid_questions = sum(len(questions[label]) for label in questions)
    count = 1
    for label in questions:
        for (a, b, c, d) in questions[label]:
            print("%d%% (%d / %d)" % ((100 * count) / float(valid_questions),
                                      count, valid_questions), end="\r")
            count += 1
            va = get_embedding(a, WR)
            vb = get_embedding(b, WR)
            vc = get_embedding(c, WR)
            x = normalize(vb - va + vc)
            s = numpy.dot(M, x)
            nns = [cands[i] for i in (-s).argsort()[:4]]
            nns = [y for y in nns if y not in (a, b, c)]
            if nns[0] == d:
                corrects[label] += 1

    # Compute accuracy
    n = semantic_total = syntactic_total = semantic_corrects = syntactic_corrects = 0
    for label in total_questions:
        n += total_questions[label]
        if label.startswith("gram"):
            syntactic_total += total_questions[label]
            syntactic_corrects += corrects[label]
        else:
            semantic_total += total_questions[label]
            semantic_corrects += corrects[label]
    print "Percentage of questions attempted = %f (%d / %d)" % (
        (100 * valid_questions) / float(n), valid_questions, n)
    for label in questions:
        acc = float(100 * corrects[label]) / float(total_questions[label])
        print "%s = %f (correct = %d, attempted = %d, total = %d)" % (
            label, acc, corrects[label], len(
                questions[label]), total_questions[label])
    semantic_accuracy = float(100 * semantic_corrects) / float(semantic_total)
    syntactic_accuracy = float(
        100 * syntactic_corrects) / float(syntactic_total)
    total_corrects = semantic_corrects + syntactic_corrects
    accuracy = float(100 * total_corrects) / float(n)
    print "Semantic Accuracy =", semantic_accuracy
    print "Syntactic Accuracy =", syntactic_accuracy
    print "Total accuracy =", accuracy
    return {
        "semantic": semantic_accuracy,
        "syntactic": syntactic_accuracy,
        "total": accuracy
    }