def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the Spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(
                get_vector(vectors, uri1), get_vector(vectors, uri2)
            )
        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' % (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation,))

    return confidence_interval(correlation, len(gold_scores))

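# A minimal usage sketch (not from the original source): it assumes `vectors` is an
# already-loaded VectorSpaceWrapper (or a DataFrame of embeddings) and shows the
# (term1, term2, gold_score) triples expected in `standard`. The triples below are
# made-up stand-ins for a real gold standard such as MEN or wordsim-353.
def example_spearman_evaluation(vectors):
    toy_standard = [
        ('cat', 'dog', 0.8),           # hypothetical gold similarity scores
        ('cat', 'banana', 0.1),
        ('car', 'automobile', 0.95),
    ]
    # Returns the confidence-interval summary computed by spearman_evaluate above.
    return spearman_evaluate(vectors, toy_standard, language='en', verbose=1)
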
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    vsw = VectorSpaceWrapper(frame=frame)
    vsw.load()

    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) - get_category_axis(frame, MALE_WORDS)
    )
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        diff = normalize_vec(
            vsw.get_vector(female_biased_uri) - vsw.get_vector(male_biased_uri)
        ).dot(gender_binary_axis)
        gender_bias_numbers.append(diff)

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series(
        [mean, mean - sem * 2, mean + sem * 2], index=['bias', 'low', 'high']
    )

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    fine_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    coarse_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = pd.DataFrame(
        np.vstack([get_category_axis(frame, names) for names in ETHNIC_NAME_SETS])
    )
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    name_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS)
    belief_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T

def read_bats(category):
    """
    Read BATS dataset pairs for a specific category. Turn them into questions.

    For some questions, BATS contains multiple answers. For example, the answer
    to an analogy question Nicaragua:Spanish::Switzerland:? could be German,
    French, or Italian. These will all be supplied as a list if they are an
    answer (b2). However, if they are a part of a question (b1), only the first
    one will be used.
    """
    filename = 'bats/{}.txt'.format(category)
    pairs = []
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            if '\t' in line:
                left, right = line.lower().split('\t')
            else:
                left, right = line.lower().split()
            right = right.strip()
            if '/' in right:
                right = [i.strip() for i in right.split('/')]
            else:
                right = [i.strip() for i in right.split(',')]
            pairs.append([left, right])

    quads = []
    for i in range(len(pairs)):
        first_pair = pairs[i]
        # select only one term for b1, even if more may be available
        first_pair[1] = first_pair[1][0]
        second_pairs = [pair for j, pair in enumerate(pairs) if j != i]
        for second_pair in second_pairs:
            quad = []

            # the first three elements of a quad are the two terms in first_pair
            # and the first term of the second_pair
            quad.extend(
                [standardized_uri('en', term) for term in first_pair + second_pair[:1]]
            )

            # if the second element of the second pair (b2) is a list, it means there
            # are multiple correct answers for b2. We want to keep all of them.
            if isinstance(second_pair[1], list):
                quad.append([standardized_uri('en', term) for term in second_pair[1]])
            else:
                quad.append(standardized_uri('en', second_pair[1]))
            quads.append(quad)
    return quads

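# A minimal consumption sketch (not from the original source) of the quads built by
# read_bats: the first three elements are standardized URIs for a1, a2 and b1, and
# the fourth is either a single URI or a list of acceptable URIs for b2. The default
# category name below is a hypothetical placeholder, not a real BATS file name.
def example_bats_questions(category='E01_hypothetical_category'):
    for a1, a2, b1, b2 in read_bats(category):
        answers = b2 if isinstance(b2, list) else [b2]
        print('%s : %s :: %s : %s' % (a1, a2, b1, ' / '.join(answers)))
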
def read_turney_analogies(filename):
    """
    Read Turney and Littman's dataset of SAT analogy questions. This data
    requires permission to redistribute, so you have to ask Peter Turney
    for the file.
    """
    questions = []
    question_lines = []

    with open(filename, encoding='utf-8') as file:
        for line in file:
            line = line.rstrip()
            if line and not line.startswith('#'):
                if len(line) == 1:
                    # A single letter on a line indicates the answer to a question.
                    answer_index = ord(line) - ord('a')

                    # Line 0 is a header we can discard.
                    raw_pairs = [qline.split(' ')[:2] for qline in question_lines[1:]]
                    concept_pairs = [
                        tuple(standardized_uri('en', term) for term in pair)
                        for pair in raw_pairs
                    ]

                    # The first of the pairs we got is the prompt pair. The others are
                    # answers (a) through (e).
                    questions.append((concept_pairs[0], concept_pairs[1:], answer_index))
                    question_lines.clear()
                else:
                    question_lines.append(line)
    return questions

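# A consumption sketch (not from the original source), assuming `filename` points at
# the SAT analogy file obtained from Peter Turney. It shows the
# (prompt_pair, choice_pairs, answer_index) structure built above; `choose_best_pair`
# is a hypothetical callable that picks the index of the most analogous choice.
def example_sat_accuracy(filename, choose_best_pair):
    questions = read_turney_analogies(filename)
    correct = sum(
        1 for prompt, choices, answer_index in questions
        if choose_best_pair(prompt, choices) == answer_index
    )
    return correct / len(questions)
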
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])

def get_vocabulary_vectors(frame, vocab):
    """
    Given a vocabulary (as a list of English terms), get a sub-frame of the
    given DataFrame containing just the known vectors for that vocabulary.
    """
    uris = [standardized_uri('en', term) for term in vocab]
    return frame.reindex(uris).dropna()

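# A small usage sketch (not from the original source), assuming `frame` is a DataFrame
# of embeddings indexed by ConceptNet URIs. Terms missing from the frame are dropped by
# reindex(...).dropna() above, so the result may have fewer rows than the input list.
def example_vocabulary_lookup(frame):
    vecs = get_vocabulary_vectors(frame, ['teacher', 'engineer', 'nurse'])
    return vecs.shape  # (number of known terms, embedding dimensionality)
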
def get_category_axis(frame, category_examples):
    """
    Get a vector representing the average of several example terms, where
    the terms are specified as plain English text.
    """
    return get_weighted_vector(
        frame,
        [(standardized_uri('en', term), 1.) for term in category_examples]
    )

def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'; and with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized
    URI get combined, with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result

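# A toy worked example (not from the original source) of the 1/(n+1) weighting above.
# Rows 0 and 2 both standardize to the same URI; they get weights 1/1 and 1/3, so the
# combined row is (1.0 * v0 + (1/3) * v2) / (1 + 1/3), and earlier (higher-ranked)
# rows dominate. The labels and vectors are made up; forms=False skips lemmatization.
def example_standardize_rows():
    toy = pd.DataFrame(
        np.array([[1.0, 0.0],
                  [0.0, 1.0],
                  [1.0, 1.0]]),
        index=['en/Cat', 'en/dog', 'en/cat'],
    )
    return standardize_row_labels(toy, language='en', forms=False)
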
def read_google_analogies(filename):
    """
    Read the 'questions-words.txt' file that comes with the word2vec package.
    """
    quads = [
        [standardized_uri('en', term) for term in line.rstrip().split(' ')]
        for line in open(filename, encoding='utf-8')
        if not line.startswith(':')
    ]
    return quads

def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.simple_tokenize(text)
    weighted_terms = [
        (uri_prefix(standardized_uri(language, token)), 1.)
        for token in tokens
    ]
    return self.get_vector(weighted_terms, oov_vector=False)

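# A usage sketch (not from the original source): text_to_vector is a method, so it is
# called on a loaded vector-space wrapper instance. The sentence is made up; the result
# is a single vector combining the tokenized, standardized terms.
def example_sentence_vector(wrapper):
    return wrapper.text_to_vector('en', 'The cat sat on the mat.')
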
def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category. This is done on shards, to reduce peak memory consumption.
    applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        applicability[shard_start:shard_end] = category_predictor.predict_proba(
            frame[shard_start:shard_end]
        )[:, 1]
    del category_predictor

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [standardized_uri('en', term) for term in bias_examples]
    components_to_reject = frame.reindex(vocab).dropna().values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased. This is also done on shards.
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        shard_len = shard_end - shard_start
        modified_component = reject_subspace(
            frame[shard_start:shard_end], components_to_reject
        ).mul(applicability[shard_start:shard_end], axis=0).values

        # Make another component representing the vectors that should not be
        # de-biased: the original space times (1 - applicability).
        np.multiply(
            1 - applicability[shard_start:shard_end].reshape((shard_len, 1)),
            frame.values[shard_start:shard_end, :],
            out=frame.values[shard_start:shard_end, :]
        )

        # The sum of these two components is the de-biased space, where
        # de-biasing applies to each row proportional to its applicability.
        np.add(
            frame.values[shard_start:shard_end, :],
            modified_component,
            out=frame.values[shard_start:shard_end, :]
        )
        del modified_component

    # L_2-normalize the resulting rows in-place.
    normalize(frame.values, norm='l2', copy=False)

def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [
        (uri_prefix(standardized_uri(language, token)), 1.)
        for token in tokens
    ]
    return self.get_vector(weighted_terms, oov_vector=False)

def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category. This is done on shards, to reduce peak memory consumption.
    applicability = np.zeros(shape=(len(frame),), dtype=np.float32)
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        applicability[shard_start:shard_end] = category_predictor.predict_proba(
            frame[shard_start:shard_end]
        )[:, 1]
    del category_predictor

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [standardized_uri('en', term) for term in bias_examples]
    components_to_reject = frame.reindex(vocab).dropna().values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased. This is also done on shards.
    modified_component = np.zeros(shape=frame.values.shape, dtype=np.float32)
    for shard_start, shard_end in make_shard_endpoints(len(frame)):
        modified_component[shard_start:shard_end, :] = \
            reject_subspace(frame[shard_start:shard_end], components_to_reject).mul(
                applicability[shard_start:shard_end], axis=0
            ).values
    del components_to_reject

    # Make another component representing the vectors that should not be
    # de-biased: the original space times (1 - applicability).
    np.multiply(
        1 - applicability.reshape((len(frame), 1)), frame.values, out=frame.values
    )

    # The sum of these two components is the de-biased space, where de-biasing
    # applies to each row proportional to its applicability.
    np.add(frame.values, modified_component, out=frame.values)
    del modified_component

    # L_2-normalize the resulting rows in-place.
    normalize(frame.values, norm='l2', copy=False)

def test_load_vectors():
    vectors = ai._load_vectors()
    ok_(vectors.frame.index[0].startswith('/c/en/'))

    # Test we have vectors for all Codenames words
    wordlist = [
        standardized_uri('en', line.strip())
        for line in open(resource_filename('codenames', 'data/codenames-words.txt'))
    ]
    for word in wordlist:
        ok_(word in vectors.frame.index)

def choose_vocab(quads, vocab_size):
    """
    Google and Bats analogies are not multiple-choice; instead, you're
    supposed to pick the best match out of your vector space's entire
    vocabulary, excluding the three words used in the prompt. The vocabulary
    size can matter a lot: Set it too high and you'll get low-frequency words
    that the data set wasn't looking for as answers. Set it too low and the
    correct answers won't be in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    return vocab

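# A brief sketch (not from the original source) contrasting the two vocabulary modes
# described in the docstring above; `quads` is assumed to come from read_google_analogies.
def example_vocab_choices(quads):
    realistic = choose_vocab(quads, vocab_size=200000)  # top frequent English words
    oracle = choose_vocab(quads, vocab_size='cheat')    # only the answer words
    return len(realistic), len(oracle)
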
def _load_vectors():
    frame = load_hdf(resource_filename('codenames', 'data/mini.h5'))
    selections = [
        label for label in frame.index
        if label.startswith('/c/en/') and '_' not in label and '#' not in label
        and wordfreq.zipf_frequency(label[6:], 'en') > 3.0
    ]

    # Make sure all the words in Codenames are represented
    wordlist = [
        standardized_uri('en', line.strip())
        for line in open(resource_filename('codenames', 'data/codenames-words.txt'))
    ]
    additions = [word for word in wordlist if word not in selections]
    selections += additions

    frame = l2_normalize_rows(frame.loc[selections].astype('f'))
    return VectorSpaceWrapper(frame=frame)

def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category.
    applicability = category_predictor.predict_proba(frame)[:, 1]
    del category_predictor

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [standardized_uri('en', term) for term in bias_examples]
    components_to_reject = frame.reindex(vocab).dropna().values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased.
    modified_component = reject_subspace(frame, components_to_reject).mul(
        applicability, axis=0
    )
    del components_to_reject

    # Make another component representing the vectors that should not be
    # de-biased: the original space times (1 - applicability).
    result = frame.mul(1 - applicability, axis=0)

    # The sum of these two components is the de-biased space, where de-biasing
    # applies to each row proportional to its applicability.
    np.add(result.values, modified_component.values, out=result.values)
    del modified_component

    # L_2-normalize the resulting rows in-place.
    normalize(result.values, norm='l2', copy=False)
    return result

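# A hedged usage sketch (not from the original source) for the variant above that
# returns a new frame. It reuses the vocabulary constants seen in measure_bias;
# pairing them this way here is an illustrative assumption, not necessarily the
# project's actual de-biasing pipeline.
def example_de_bias(frame):
    return de_bias_category(frame, PEOPLE_BY_BELIEF, BELIEF_STEREOTYPE_TERMS)
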
def eval_semeval2012_analogies(vectors, weight_direct, weight_transpose, subset, subclass):
    """
    For a set of test pairs:

    * Compute a Spearman correlation coefficient between the ranks produced
      by vectors and gold ranks.
    * Compute an accuracy score of answering MaxDiff questions.
    """
    train_pairs = read_train_pairs_semeval2012(subset, subclass)
    test_questions = read_test_questions_semeval2012(subset, subclass)
    pairqnum2least, pairqnum2most = read_turk_answers_semeval2012(
        subset, subclass, test_questions
    )
    turk_rank = read_turk_ranks_semeval2012(subset, subclass)
    pairs_to_rank = [pair for pair, score in turk_rank]

    # Assign a score to each pair, according to pairwise_analogy_func
    our_pair_scores = {}
    for pair in pairs_to_rank:
        rank_pair_scores = []
        for train_pair in train_pairs:
            pair_to_rank = pair.strip().replace('"', '').split(':')
            score = pairwise_analogy_func(
                vectors,
                standardized_uri('en', train_pair[0]),
                standardized_uri('en', train_pair[1]),
                standardized_uri('en', pair_to_rank[0]),
                standardized_uri('en', pair_to_rank[1]),
                weight_direct,
                weight_transpose
            )
            rank_pair_scores.append(score)
        our_pair_scores[pair] = np.mean(rank_pair_scores)

    # Answer MaxDiff questions using the ranks from the previous step
    correct_most = 0
    correct_least = 0
    total = 0

    for i, question in enumerate(test_questions):
        question_pairs_scores = []
        for question_pair in question:
            score = our_pair_scores[question_pair]
            question_pairs_scores.append(score)

        our_answer_most = question[np.argmax(question_pairs_scores)]
        our_answer_least = question[np.argmin(question_pairs_scores)]

        votes_guess_least = pairqnum2least[(i, our_answer_least)]
        votes_guess_most = pairqnum2most[(i, our_answer_most)]

        max_votes_least = 0
        max_votes_most = 0
        for question_pair in question:
            num_votes_least = pairqnum2least[(i, question_pair)]
            num_votes_most = pairqnum2most[(i, question_pair)]
            if num_votes_least > max_votes_least:
                max_votes_least = num_votes_least
            if num_votes_most > max_votes_most:
                max_votes_most = num_votes_most

        # a guess is correct if it got the same number of votes as the most
        # frequent turkers' answer
        if votes_guess_least == max_votes_least:
            correct_least += 1
        if votes_guess_most == max_votes_most:
            correct_most += 1
        total += 1

    # Compute Spearman correlation of our ranks and MT ranks
    our_semeval_scores = [score for pair, score in sorted(our_pair_scores.items())]
    turk_semeval_scores = [score for pair, score in turk_rank]
    spearman = spearmanr(our_semeval_scores, turk_semeval_scores)[0]
    spearman_results = confidence_interval(spearman, total)

    # Compute an accuracy score on MaxDiff questions
    maxdiff = (correct_least + correct_most) / (2 * total)
    low_maxdiff, high_maxdiff = proportion_confint(
        (correct_least + correct_most), (2 * total)
    )
    maxdiff_results = pd.Series(
        [maxdiff, low_maxdiff, high_maxdiff], index=['acc', 'low', 'high']
    )

    return [maxdiff_results, spearman_results]

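# A call sketch (not from the original source). The subset/subclass identifiers and
# the direct/transpose weights below are placeholders for a SemEval-2012 Task 2
# configuration, not values taken from the original evaluation.
def example_semeval_eval(vectors):
    maxdiff_results, spearman_results = eval_semeval2012_analogies(
        vectors, weight_direct=0.35, weight_transpose=0.65,
        subset='test', subclass='1a'
    )
    return maxdiff_results, spearman_results
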
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [
        (standardized_uri(language, token), 1.) for token in tokens
    ]
    return self.get_vector(weighted_terms, include_neighbors=False)

def eval_google_analogies(vectors, subset='semantic', vocab_size=200000, verbose=False):
    """
    Evaluate the Google Research analogies, released by Mikolov et al. along
    with word2vec.

    These analogies come in two flavors: semantic and syntactic. Numberbatch
    is intended to be a semantic space, so we focus on semantic analogies.
    The syntactic analogies are about whether you can inflect or conjugate a
    particular word. The semantic analogies are about whether you can sort
    words by their gender, and about geographic trivia.

    I (Rob) think this data set is not very representative, but evaluating
    against it is all the rage.

    These analogies are not multiple-choice; instead, you're supposed to pick
    the best match out of your vector space's entire vocabulary, excluding the
    three words used in the prompt. The vocabulary size can matter a lot: Set
    it too high and you'll get low-frequency words that the data set wasn't
    looking for as answers. Set it too low and the correct answers won't be in
    the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    filename = get_support_data_filename('google-analogies/{}-words.txt'.format(subset))
    quads = read_google_analogies(filename)
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    vecs = np.vstack([vectors.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        result = best_analogy_3cosmul(vectors, tframe, *prompt)
        if result == answer:
            correct += 1
        else:
            if verbose and result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    result = pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
    if verbose:
        print(result)
    return result

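# A usage sketch (not from the original source), assuming `vectors` is a loaded
# VectorSpaceWrapper. The returned Series carries 'acc', 'low' and 'high', as
# constructed at the end of eval_google_analogies above.
def example_google_eval(vectors):
    semantic = eval_google_analogies(vectors, subset='semantic', vocab_size=200000)
    cheat = eval_google_analogies(vectors, subset='semantic', vocab_size='cheat')
    return semantic['acc'], cheat['acc']
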
def tag_en(word):
    return standardized_uri('en', word)

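# Example (not from the original source): tag_en simply wraps standardized_uri for
# English, turning plain text into a ConceptNet-style URI (e.g. 'ice cream' becomes
# something like '/c/en/ice_cream').
assert tag_en('example') == standardized_uri('en', 'example')
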