Example #1
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame containing term vectors on its ability to predict term
    relatedness, according to MEN-3000, RW, MTurk-771, and WordSim-353. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing these labeled results.
    """
    # Make subset names consistent with other datasets
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        # for the final evaluation, use just the test data
        subset = 'test'
    filename = get_support_data_filename('story-cloze/cloze_test_spring2016_%s.tsv' % subset)
    vectors = VectorSpaceWrapper(frame=frame)
    total = 0
    correct = 0
    for sentences, answers in read_cloze(filename):
        text = ' '.join(sentences)
        right_answer, wrong_answer = answers
        probe_vec = vectors.text_to_vector('en', text)
        right_vec = vectors.text_to_vector('en', right_answer)
        wrong_vec = vectors.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(probe_vec, right_vec)
        wrong_sim = cosine_similarity(probe_vec, wrong_vec)
        if right_sim > wrong_sim:
            correct += 1
        total += 1
        # print("%+4.2f %s / %s / %s" % (right_sim - wrong_sim, text, right_answer, wrong_answer))
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
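The evaluate function above assumes a read_cloze helper that yields (story sentences, (right_answer, wrong_answer)) pairs. A minimal sketch of what that helper might look like, assuming the column names of the public Story Cloze spring-2016 TSV release (InputSentence1-4, RandomFifthSentenceQuiz1/2, AnswerRightEnding); the real helper in the source repo may differ:

import csv

def read_cloze(filename):
    # Hypothetical sketch: parse a Story Cloze TSV and yield
    # (story sentences, (right ending, wrong ending)) pairs.
    with open(filename, encoding='utf-8') as infile:
        reader = csv.DictReader(infile, delimiter='\t')
        for row in reader:
            sentences = [row['InputSentence%d' % i] for i in range(1, 5)]
            endings = [row['RandomFifthSentenceQuiz1'],
                       row['RandomFifthSentenceQuiz2']]
            if row['AnswerRightEnding'] == '2':
                endings.reverse()  # put the right ending first
            yield sentences, tuple(endings)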
Example #3
def make_replacements_faster(small_frame,
                             big_frame,
                             tree_depth=1000,
                             lang='en',
                             verbose=False):
    """
    Create a replacements dictionary to map terms only present in a big frame to the closest term
    in a small_frame. This is a faster than make_replacements(), because it uses a fast
    implementation of the approximate nearest neighbor algorithm.

    tree_depth=1000 provides a good balance of speed and accuracy.
    """
    intersected = big_frame.reindex(small_frame.index).dropna()
    index, index_map = build_annoy_tree(intersected, tree_depth)
    replacements = {}
    for term in big_frame.index:
        if term not in small_frame.index and not term.startswith('/x/'):
            most_similar_index = index.get_nns_by_vector(
                big_frame.loc[term], 1)[0]
            most_similar = index_map[most_similar_index]
            similarity = cosine_similarity(
                get_vector(big_frame, term, lang),
                get_vector(small_frame, most_similar, lang))
            replacements[term] = [most_similar, round(similarity, 2)]

            if verbose and not (len(replacements) % 20):
                print('{} ==> {}, {}'.format(term, most_similar, similarity))
    return replacements
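make_replacements_faster relies on a build_annoy_tree helper that isn't shown here. A plausible sketch using the Annoy library, assuming tree_depth is the number of trees passed to Annoy's build(); the actual helper may differ:

from annoy import AnnoyIndex

def build_annoy_tree(frame, tree_depth=1000):
    # Hypothetical sketch: index each row vector of the frame in an Annoy
    # index, remembering which integer id maps back to which term.
    index = AnnoyIndex(frame.shape[1], 'angular')
    index_map = {}
    for i, (term, vector) in enumerate(frame.iterrows()):
        index.add_item(i, vector.tolist())
        index_map[i] = term
    index.build(tree_depth)
    return index, index_map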
Example #4
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(get_vector(vectors, uri1),
                                          get_vector(vectors, uri2))
        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' %
                  (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation, ))

    return confidence_interval(correlation, len(gold_scores))
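confidence_interval isn't defined in this excerpt. One common way to put a confidence interval on a correlation coefficient is the Fisher z-transform; a sketch under that assumption, returning a Series shaped like the other evaluation results:

import numpy as np
import pandas as pd
from scipy.stats import norm

def confidence_interval(r, n, alpha=0.05):
    # Hypothetical sketch: Fisher z-transform confidence interval for a
    # correlation coefficient r computed over n pairs.
    z = np.arctanh(r)
    stderr = 1.0 / np.sqrt(n - 3)
    z_crit = norm.ppf(1 - alpha / 2)
    low = np.tanh(z - z_crit * stderr)
    high = np.tanh(z + z_crit * stderr)
    return pd.Series([r, low, high], index=['acc', 'low', 'high'])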
Example #6
def get_similarity(self, query1, query2):
    """Return the cosine similarity between the vectors for two queries."""
    vec1 = self.get_vector(query1)
    vec2 = self.get_vector(query2)
    return cosine_similarity(vec1, vec2)
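cosine_similarity itself is assumed throughout these examples. A minimal NumPy sketch with a zero-vector guard (the helper in the actual codebase may normalize differently):

import numpy as np

def cosine_similarity(vec1, vec2):
    # Hypothetical sketch: dot product of the two vectors divided by the
    # product of their norms; returns 0.0 when either vector is all zeros.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product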
Example #7
            idx1 = conceptnet5.uri.concept_uri("en", row[0].lower())
            idx2 = conceptnet5.uri.concept_uri("en", row[1].lower())
            try:
                mw1 = retrowords.loc[idx1]
            except KeyError:
                # Fall back to fastText + retrogan when the term is missing
                mw1 = ft_model.get_word_vector(row[0].lower())
                mw1 = np.array(retrogan.predict(mw1.reshape(1, 300))).reshape(
                    (300, ))
            try:
                mw2 = retrowords.loc[idx2]
            except KeyError:
                mw2 = ft_model.get_word_vector(row[1].lower())
                mw2 = np.array(retrogan.predict(mw2.reshape(1, 300))).reshape(
                    (300, ))
            score = cosine_similarity(mw1, mw2)

            my_word_tuples.append((row[0], row[1], score))
            try:
                idx1 = "/c/en/" + row[0].lower()
                idx2 = "/c/en/" + row[1].lower()
                nw1 = numberbatch.loc[idx1]
                nw2 = numberbatch.loc[idx2]
                score = cosine_similarity(nw1, nw2)
            except Exception as e:
                print("Not found for")
                print(e)
                score = 0
            nb_word_tuples.append((row[0], row[1], score))
            try:
                nw1 = retrowords.loc[idx1]
            except KeyError as e:
                print("Not found for")
                print(e)
                score = 0
                missed_words.add(row[0].lower())
            try:
                nw2 = retrowords.loc[idx2]
                score = cosine_similarity(nw1, nw2)
            except Exception as e:
                # Catches a missing idx2, and any error from
                # cosine_similarity when nw1 failed to load above.
                print("Not found for")
                print(e)
                missed_words.add(row[1].lower())
                score = 0
            nb_word_tuples.append((row[0], row[1], score))
        print(f'Processed {line_count} lines.')
    print(len(missed_words))
    print(missed_words)
    # print(pearsonr([float(x[2]) for x in word_tuples],[float(x[2]) for x in my_word_tuples]))
    # print(spearmanr([x[2] for x in word_tuples],[x[2] for x in my_word_tuples]))
    print("For the attract-repelDS")
Example #10
            idx2 = conceptnet5.uri.concept_uri("en", row[1].lower())
            try:
                mw1 = retrowords.loc[idx1]
            except KeyError:
                missed_words.add(row[0].lower())
                mw1 = ft_model.get_word_vector(row[0].lower())
                mw1 = np.array(retrogan.predict(mw1.reshape(1, 300))).reshape(
                    (300, ))
            try:
                mw2 = retrowords.loc[idx2]
            except KeyError:
                missed_words.add(row[1].lower())
                mw2 = ft_model.get_word_vector(row[1].lower())
                mw2 = np.array(retrogan.predict(mw2.reshape(1, 300))).reshape(
                    (300, ))
            score = cosine_similarity(mw1, mw2)

            my_word_tuples.append((row[0], row[1], score))
            # try:
            #     idx1 = "/c/en/" + row[0].lower()
            #     idx2 = "/c/en/" + row[1].lower()
            #     nw1 = numberbatch.loc[idx1]
            #     nw2 = numberbatch.loc[idx2]
            #     score = cosine_similarity(nw1, nw2)
            # except Exception as e:
            #     print("Not found for")
            #     print(e)
            #     score = 0
            # nb_word_tuples.append((row[0], row[1], score))
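Examples #7 and #10 repeat the same lookup-with-fallback pattern. A hypothetical helper that consolidates it (the names retrowords, ft_model, retrogan, and missed_words come from the examples; the helper itself is not in the source):

def get_retrofitted_vector(word, retrowords, ft_model, retrogan, missed_words):
    # Hypothetical consolidation of the pattern above: look the word up in
    # the retrofitted frame, and fall back to running its fastText vector
    # through retrogan when it is missing.
    idx = conceptnet5.uri.concept_uri("en", word.lower())
    try:
        return retrowords.loc[idx]
    except KeyError:
        missed_words.add(word.lower())
        vec = ft_model.get_word_vector(word.lower())
        return np.array(retrogan.predict(vec.reshape(1, 300))).reshape((300,))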