Example #1
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the Spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(get_vector(vectors, uri1), get_vector(vectors, uri2))
        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' % (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation,))

    return confidence_interval(correlation, len(gold_scores))
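A minimal usage sketch (the gold-standard triples and the `vectors` object here are hypothetical; loading a real vector space is out of scope):

# Hypothetical gold standard: (term1, term2, expected_score) triples, the
# shape this function expects from wordsim-style evaluation data.
toy_standard = [
    ('cat', 'dog', 0.80),
    ('cat', 'piano', 0.10),
    ('car', 'automobile', 0.95),
]

# `vectors` is assumed to be a DataFrame of row vectors indexed by ConceptNet
# URIs, or a VectorSpaceWrapper around one.
result = spearman_evaluate(vectors, toy_standard, verbose=1)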
Example #2
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the Spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(get_vector(vectors, uri1),
                                          get_vector(vectors, uri2))
        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' %
                  (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation, ))

    return confidence_interval(correlation, len(gold_scores))
Example #3
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    vsw = VectorSpaceWrapper(frame=frame)
    vsw.load()

    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) -
        get_category_axis(frame, MALE_WORDS))
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        diff = normalize_vec(
            vsw.get_vector(female_biased_uri) -
            vsw.get_vector(male_biased_uri)).dot(gender_binary_axis)
        gender_bias_numbers.append(diff)

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series([mean, mean - sem * 2, mean + sem * 2],
                            index=['bias', 'low', 'high'])

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    fine_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    coarse_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = pd.DataFrame(
        np.vstack(
            [get_category_axis(frame, names) for names in ETHNIC_NAME_SETS]))
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    name_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS)
    belief_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T
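A sketch of how this might be called, assuming `frame` is an already-loaded, L2-normalized term-vector DataFrame (the loading step is omitted):

bias_table = measure_bias(frame)
print(bias_table)
# One row per measurement: 'gender', 'ethnicity-fine', 'ethnicity-coarse',
# 'ethnicity-names', and 'beliefs'.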
Example #4
def read_bats(category):
    """
    Read BATS dataset pairs for a specific category. Turn them into questions.

    For some questions, BATS contains multiple answers. For example, the answer to an
    analogy question Nicaragua:Spanish::Switzerland:? could be German, French, or Italian. These
    will all be supplied as a list if they are an answer (b2). However, if they are a part of a
    question (b1), only the first one will be used.
    """
    filename = 'bats/{}.txt'.format(category)
    pairs = []
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            if '\t' in line:
                left, right = line.lower().split('\t')
            else:
                left, right = line.lower().split()
            right = right.strip()
            if '/' in right:
                right = [i.strip() for i in right.split('/')]
            else:
                right = [i.strip() for i in right.split(',')]
            pairs.append([left, right])

    quads = []
    for i in range(len(pairs)):
        # Select only one term for b1, even if more may be available. Work on
        # a copy instead of mutating `pairs`, so that later iterations still
        # see the full list of answers when this pair is used as b2.
        first_pair = [pairs[i][0], pairs[i][1][0]]
        second_pairs = [pair for j, pair in enumerate(pairs) if j != i]
        for second_pair in second_pairs:
            quad = []

            # the first three elements of a quad are the two terms in first_pair and the first
            # term of the second_pair
            quad.extend([
                standardized_uri('en', term)
                for term in first_pair + second_pair[:1]
            ])

            # if the second element of the second pair (b2) is a list, it means there are multiple
            # correct answers for b2. We want to keep all of them.
            if isinstance(second_pair[1], list):
                quad.append(
                    [standardized_uri('en', term) for term in second_pair[1]])
            else:
                quad.append(standardized_uri('en', second_pair[1]))
            quads.append(quad)
    return quads
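A sketch of the structure this returns; the category name is a placeholder that has to match a file under the 'bats/' support data:

quads = read_bats('some_bats_category')   # placeholder category name
a1, a2, b1, b2 = quads[0]
# a1, a2 and b1 are single ConceptNet URIs; b2 is either one URI or a list of
# URIs when BATS accepts several answers to the question.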
Example #5
def read_turney_analogies(filename):
    """
    Read Turney and Littman's dataset of SAT analogy questions. This data
    requires permission to redistribute, so you have to ask Peter Turney
    for the file.
    """
    questions = []
    question_lines = []
    with open(filename, encoding='utf-8') as file:
        for line in file:
            line = line.rstrip()
            if line and not line.startswith('#'):
                if len(line) == 1:
                    # A single letter on a line indicates the answer to a question.
                    answer_index = ord(line) - ord('a')

                    # Line 0 is a header we can discard.
                    raw_pairs = [qline.split(' ')[:2] for qline in question_lines[1:]]
                    concept_pairs = [tuple(standardized_uri('en', term) for term in pair) for pair in raw_pairs]

                    # The first of the pairs we got is the prompt pair. The others are
                    # answers (a) through (e).
                    questions.append((concept_pairs[0], concept_pairs[1:], answer_index))
                    question_lines.clear()
                else:
                    question_lines.append(line)
    return questions
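Because the data cannot be redistributed, the path below is a placeholder; the sketch just shows the shape of what comes back:

questions = read_turney_analogies('path/to/sat-analogy-questions.txt')
prompt_pair, choice_pairs, answer_index = questions[0]
# prompt_pair is a (uri, uri) tuple, choice_pairs holds the candidate pairs
# (a) through (e), and answer_index says which of them is correct.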
Example #6
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                    )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
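A usage sketch, assuming `frame` is an already-loaded term-vector DataFrame:

scores = eval_analogies(frame)
print(scores)
# A Series with 'acc' (fraction of quads answered correctly) and 'low'/'high',
# the bounds of a binomial confidence interval on that accuracy.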
Example #7
def get_vocabulary_vectors(frame, vocab):
    """
    Given a vocabulary (as a list of English terms), get a sub-frame of the
    given DataFrame containing just the known vectors for that vocabulary.
    """
    uris = [standardized_uri('en', term) for term in vocab]
    return frame.reindex(uris).dropna()
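A small self-contained sketch with a toy frame; real frames are indexed by ConceptNet URIs in the same '/c/en/...' form:

import numpy as np
import pandas as pd

toy_frame = pd.DataFrame(np.eye(3),
                         index=['/c/en/dog', '/c/en/cat', '/c/en/piano'])
subframe = get_vocabulary_vectors(toy_frame, ['dog', 'piano', 'xylophone'])
# 'xylophone' has no row in the frame, so dropna() leaves only the vectors
# for /c/en/dog and /c/en/piano.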
Example #8
def read_turney_analogies(filename):
    """
    Read Turney and Littman's dataset of SAT analogy questions. This data
    requires permission to redistribute, so you have to ask Peter Turney
    for the file.
    """
    questions = []
    question_lines = []
    with open(filename, encoding='utf-8') as file:
        for line in file:
            line = line.rstrip()
            if line and not line.startswith('#'):
                if len(line) == 1:
                    # A single letter on a line indicates the answer to a question.
                    answer_index = ord(line) - ord('a')

                    # Line 0 is a header we can discard.
                    raw_pairs = [
                        qline.split(' ')[:2] for qline in question_lines[1:]
                    ]
                    concept_pairs = [
                        tuple(standardized_uri('en', term) for term in pair)
                        for pair in raw_pairs
                    ]

                    # The first of the pairs we got is the prompt pair. The others are
                    # answers (a) through (e).
                    questions.append(
                        (concept_pairs[0], concept_pairs[1:], answer_index))
                    question_lines.clear()
                else:
                    question_lines.append(line)

    return questions
Example #9
def get_category_axis(frame, category_examples):
    """
    Get a vector representing the average of several example terms, where
    the terms are specified as plain English text.
    """
    return get_weighted_vector(frame, [(standardized_uri('en', term), 1.)
                                       for term in category_examples])
Example #10
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (or terms of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (of the form '/c/en/term', with some extra word2vec-style normalization
    of digits). Rows whose labels get the same standardized URI get combined,
    with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
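A toy example of the relabeling and combining behavior (forms=False sidesteps the lemmatization step):

import pandas as pd

raw = pd.DataFrame([[1.0, 0.0],
                    [0.0, 1.0],
                    [0.0, 2.0]],
                   index=['New York', 'new york', 'Boston'])
std = standardize_row_labels(raw, forms=False)
# Both spellings map to /c/en/new_york, so their rows are merged into one
# weighted average, with the earlier row counting for more; /c/en/boston
# keeps its own row.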
Example #11
def read_google_analogies(filename):
    """
    Read the 'questions-words.txt' file that comes with the word2vec package.
    """
    quads = [[
        standardized_uri('en', term) for term in line.rstrip().split(' ')
    ] for line in open(filename, encoding='utf-8') if not line.startswith(':')]
    return quads
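A sketch of the quad format this produces (the support-data path is the one used elsewhere in this code):

filename = get_support_data_filename('google-analogies/questions-words.txt')
quads = read_google_analogies(filename)
a1, a2, b1, b2 = quads[0]   # four ConceptNet URIs, read as a1 : a2 :: b1 : b2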
Example #12
def text_to_vector(self, language, text):
    """
    Used in the Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.simple_tokenize(text)
    weighted_terms = [(uri_prefix(standardized_uri(language, token)), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, oov_vector=False)
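The text_to_vector variants shown on this page are methods on a wrapper object that also provides get_vector; a hypothetical call, with `wrapper` standing in for such an object, looks like this:

# `wrapper` is assumed to expose the method above along with get_vector.
story_vec = wrapper.text_to_vector('en', 'The cat sat on the mat.')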
Example #13
def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category.  This is done on shards, to reduce peak memory consumption.
    applicability = np.zeros(shape=(len(frame), ), dtype=np.float32)
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        applicability[shard_start:shard_end] = category_predictor.predict_proba(
            frame[shard_start:shard_end])[:, 1]
    del category_predictor

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [standardized_uri('en', term) for term in bias_examples]
    components_to_reject = frame.reindex(vocab).dropna().values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased.  This is also done on shards.
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        shard_len = shard_end - shard_start
        modified_component = reject_subspace(
            frame[shard_start:shard_end], components_to_reject
        ).mul(applicability[shard_start:shard_end], axis=0).values

        # Make another component representing the vectors that should not be
        # de-biased: the original space times (1 - applicability).
        np.multiply(1 - applicability[shard_start:shard_end].reshape((shard_len, 1)),
                    frame.values[shard_start:shard_end, :],
                    out=frame.values[shard_start:shard_end, :])

        # The sum of these two components is the de-biased space, where
        # de-biasing applies to each row proportional to its applicability.
        np.add(frame.values[shard_start:shard_end, :],
               modified_component,
               out=frame.values[shard_start:shard_end, :])
        del modified_component

    # L_2-normalize the resulting rows in-place.
    normalize(frame.values, norm='l2', copy=False)
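A sketch of a call, reusing the curated word lists that measure_bias (above) already relies on; note that this variant modifies `frame` in place rather than returning a new DataFrame:

# `frame` is assumed to be a loaded term-vector DataFrame; PEOPLE_BY_BELIEF and
# BELIEF_STEREOTYPE_TERMS are the constant lists used in measure_bias above.
de_bias_category(frame, PEOPLE_BY_BELIEF, BELIEF_STEREOTYPE_TERMS)
# Afterwards, rows judged likely to belong to the category have had their
# components along the stereotype directions removed, and every row has been
# re-normalized to unit length.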
Example #14
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (or terms of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (of the form '/c/en/term', with some extra word2vec-style normalization
    of digits). Rows whose labels get the same standardized URI get combined,
    with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
Example #15
def text_to_vector(self, language, text):
    """
    Used in the Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [
        (uri_prefix(standardized_uri(language, token)), 1.) for token in tokens
    ]
    return self.get_vector(weighted_terms, oov_vector=False)
Example #16
def get_category_axis(frame, category_examples):
    """
    Get a vector representing the average of several example terms, where
    the terms are specified as plain English text.
    """
    return get_weighted_vector(
        frame,
        [(standardized_uri('en', term), 1.)
         for term in category_examples]
    )
Example #17
def read_google_analogies(filename):
    """
    Read the 'questions-words.txt' file that comes with the word2vec package.
    """
    quads = [
        [standardized_uri('en', term) for term in line.rstrip().split(' ')]
        for line in open(filename, encoding='utf-8')
        if not line.startswith(':')
    ]
    return quads
Example #18
def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category.  This is done on shards, to reduce peak memory consumption.
    applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        applicability[shard_start:shard_end] = category_predictor.predict_proba(
            frame[shard_start:shard_end])[:, 1]
    del category_predictor

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [
        standardized_uri('en', term) for term in bias_examples
    ]
    components_to_reject = frame.reindex(vocab).dropna().values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased.  This is also done on shards.
    modified_component = np.zeros(shape=frame.values.shape, dtype=np.float32)
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        modified_component[shard_start:shard_end, :] = reject_subspace(
            frame[shard_start:shard_end], components_to_reject
        ).mul(applicability[shard_start:shard_end], axis=0).values
    del components_to_reject

    # Make another component representing the vectors that should not be
    # de-biased: the original space times (1 - applicability).
    np.multiply(1 - applicability.reshape((len(frame), 1)), frame.values,
                out=frame.values)

    # The sum of these two components is the de-biased space, where de-biasing
    # applies to each row proportional to its applicability.
    np.add(frame.values, modified_component, out=frame.values)
    del modified_component

    # L_2-normalize the resulting rows in-place.
    normalize(frame.values, norm='l2', copy=False)
Example #19
def test_load_vectors():
    vectors = ai._load_vectors()
    ok_(vectors.frame.index[0].startswith('/c/en/'))

    # Test we have vectors for all Codenames words
    wordlist = [
        standardized_uri('en', line.strip()) for line in open(
            resource_filename('codenames', 'data/codenames-words.txt'))
    ]
    for word in wordlist:
        ok_(word in vectors.frame.index)
Example #20
def choose_vocab(quads, vocab_size):
    """
    Google and BATS analogies are not multiple-choice; instead, you're supposed to pick
    the best match out of your vector space's entire vocabulary, excluding the
    three words used in the prompt. The vocabulary size can matter a lot: Set
    it too high and you'll get low-frequency words that the data set wasn't
    looking for as answers. Set it too low and the correct answers won't be
    in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    return vocab
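For example, with `quads` coming from one of the analogy readers above:

cheat_vocab = choose_vocab(quads, 'cheat')   # only the answer words
real_vocab = choose_vocab(quads, 200000)     # top 200,000 English words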
Example #21
def _load_vectors():
    frame = load_hdf(resource_filename('codenames', 'data/mini.h5'))
    selections = [
        label for label in frame.index
        if label.startswith('/c/en/') and '_' not in label and '#' not in label
        and wordfreq.zipf_frequency(label[6:], 'en') > 3.0
    ]
    # Make sure all the words in Codenames are represented
    wordlist = [
        standardized_uri('en', line.strip()) for line in open(
            resource_filename('codenames', 'data/codenames-words.txt'))
    ]
    additions = [word for word in wordlist if word not in selections]
    selections += additions
    frame = l2_normalize_rows(frame.loc[selections].astype('f'))
    return VectorSpaceWrapper(frame=frame)
Example #22
def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category.
    applicability = category_predictor.predict_proba(frame)[:, 1]
    del category_predictor

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [standardized_uri('en', term) for term in bias_examples]
    components_to_reject = frame.reindex(vocab).dropna().values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased.
    modified_component = reject_subspace(frame, components_to_reject).mul(
        applicability, axis=0)
    del components_to_reject

    # Make another component representing the vectors that should not be
    # de-biased: the original space times (1 - applicability).
    result = frame.mul(1 - applicability, axis=0)

    # The sum of these two components is the de-biased space, where de-biasing
    # applies to each row proportional to its applicability.
    np.add(result.values, modified_component.values, out=result.values)
    del modified_component

    # L_2-normalize the resulting rows in-place.
    normalize(result.values, norm='l2', copy=False)
    return result
Example #23
def text_to_vector(self, language, text):
    # TODO: docstring -- is this only used for Story Cloze Test?
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(standardized_uri(language, token), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, include_neighbors=False)
Example #24
def eval_semeval2012_analogies(vectors, weight_direct, weight_transpose,
                               subset, subclass):
    """
    For a set of test pairs:
        * Compute a Spearman correlation coefficient between the ranks produced by vectors and
           gold ranks.
        * Compute an accuracy score of answering MaxDiff questions.
    """
    train_pairs = read_train_pairs_semeval2012(subset, subclass)
    test_questions = read_test_questions_semeval2012(subset, subclass)
    pairqnum2least, pairqnum2most = read_turk_answers_semeval2012(
        subset, subclass, test_questions)
    turk_rank = read_turk_ranks_semeval2012(subset, subclass)
    pairs_to_rank = [pair for pair, score in turk_rank]

    # Assign a score to each pair, according to pairwise_analogy_func
    our_pair_scores = {}
    for pair in pairs_to_rank:
        rank_pair_scores = []
        for train_pair in train_pairs:
            pair_to_rank = pair.strip().replace('"', '').split(':')
            score = pairwise_analogy_func(
                vectors, standardized_uri('en', train_pair[0]),
                standardized_uri('en', train_pair[1]),
                standardized_uri('en', pair_to_rank[0]),
                standardized_uri('en', pair_to_rank[1]), weight_direct,
                weight_transpose)
            rank_pair_scores.append(score)
        our_pair_scores[pair] = np.mean(rank_pair_scores)

    # Answer MaxDiff questions using the ranks from the previous step
    correct_most = 0
    correct_least = 0
    total = 0

    for i, question in enumerate(test_questions):
        question_pairs_scores = []

        for question_pair in question:
            score = our_pair_scores[question_pair]
            question_pairs_scores.append(score)

        our_answer_most = question[np.argmax(question_pairs_scores)]
        our_answer_least = question[np.argmin(question_pairs_scores)]

        votes_guess_least = pairqnum2least[(i, our_answer_least)]
        votes_guess_most = pairqnum2most[(i, our_answer_most)]

        max_votes_least = 0
        max_votes_most = 0
        for question_pair in question:
            num_votes_least = pairqnum2least[(i, question_pair)]
            num_votes_most = pairqnum2most[(i, question_pair)]
            if num_votes_least > max_votes_least:
                max_votes_least = num_votes_least
            if num_votes_most > max_votes_most:
                max_votes_most = num_votes_most

        # A guess is correct if it received as many votes as the answer that
        # the turkers chose most often.
        if votes_guess_least == max_votes_least:
            correct_least += 1
        if votes_guess_most == max_votes_most:
            correct_most += 1
        total += 1

    # Compute the Spearman correlation between our ranks and the Mechanical Turk ranks
    our_semeval_scores = [
        score for pair, score in sorted(our_pair_scores.items())
    ]
    turk_semeval_scores = [score for pair, score in turk_rank]
    spearman = spearmanr(our_semeval_scores, turk_semeval_scores)[0]
    spearman_results = confidence_interval(spearman, total)

    # Compute an accuracy score on MaxDiff questions
    maxdiff = (correct_least + correct_most) / (2 * total)
    low_maxdiff, high_maxdiff = proportion_confint(
        (correct_least + correct_most), (2 * total))
    maxdiff_results = pd.Series([maxdiff, low_maxdiff, high_maxdiff],
                                index=['acc', 'low', 'high'])

    return [maxdiff_results, spearman_results]
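A hypothetical call; the weights and the subset/subclass identifiers below are placeholders that have to match however the SemEval-2012 support data is laid out:

maxdiff_results, spearman_results = eval_semeval2012_analogies(
    vectors, weight_direct=0.35, weight_transpose=0.65,
    subset='test', subclass='1a')   # placeholder argument values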
Example #25
def text_to_vector(self, language, text):
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(standardized_uri(language, token), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, include_neighbors=False)
Example #26
def text_to_vector(self, language, text):
    """Used in the Story Cloze Test to create a vector for text."""
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(standardized_uri(language, token), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, include_neighbors=False)
Example #27
def eval_google_analogies(vectors,
                          subset='semantic',
                          vocab_size=200000,
                          verbose=False):
    """
    Evaluate the Google Research analogies, released by Mikolov et al. along
    with word2vec.

    These analogies come in two flavors: semantic and syntactic. Numberbatch
    is intended to be a semantic space, so we focus on semantic analogies.

    The syntactic analogies are about whether you can inflect or conjugate a
    particular word. The semantic analogies are about whether you can sort
    words by their gender, and about geographic trivia.

    I (Rob) think this data set is not very representative, but evaluating
    against it is all the rage.

    These analogies are not multiple-choice; instead, you're supposed to pick
    the best match out of your vector space's entire vocabulary, excluding the
    three words used in the prompt. The vocabulary size can matter a lot: Set
    it too high and you'll get low-frequency words that the data set wasn't
    looking for as answers. Set it too low and the correct answers won't be
    in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    filename = get_support_data_filename(
        'google-analogies/{}-words.txt'.format(subset))
    quads = read_google_analogies(filename)
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    vecs = np.vstack([vectors.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        result = best_analogy_3cosmul(vectors, tframe, *prompt)
        if result == answer:
            correct += 1
        else:
            if verbose and result not in seen_mistakes:
                print("%s : %s :: %s : [%s] (should be %s)" %
                      (quad[0], quad[1], quad[2], result, answer))
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    result = pd.Series([correct / total, low, high],
                       index=['acc', 'low', 'high'])
    if verbose:
        print(result)
    return result
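For instance, to score the semantic subset against the unrealistically favorable 'cheat' vocabulary:

result = eval_google_analogies(vectors, subset='semantic',
                               vocab_size='cheat', verbose=True)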
Example #28
def tag_en(word):
    return standardized_uri('en', word)
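For example (the outputs shown follow standardized_uri's usual behavior of lowercasing and joining words with underscores):

tag_en('dog')        # '/c/en/dog'
tag_en('New York')   # '/c/en/new_york'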
Example #29
def text_to_vector(self, language, text):
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(standardized_uri(language, token), 1.0) for token in tokens]
    return self.get_vector(weighted_terms, include_neighbors=False)