def make_replacements_faster(small_frame, big_frame, tree_depth=1000, lang='en', verbose=False):
    """
    Create a replacements dictionary to map terms only present in a big frame
    to the closest term in a small_frame. This is faster than
    make_replacements(), because it uses a fast implementation of the
    approximate nearest neighbor algorithm.

    tree_depth=1000 provides a good balance of speed and accuracy.
    """
    # Index only the rows of big_frame whose labels also appear in
    # small_frame, so every nearest neighbor found is usable as a
    # replacement drawn from small_frame's vocabulary.
    shared_rows = big_frame.reindex(small_frame.index).dropna()
    annoy_index, annoy_labels = build_annoy_tree(shared_rows, tree_depth)

    replacements = {}
    for term in big_frame.index:
        # Terms already in small_frame need no replacement; terms in the
        # '/x/' namespace are skipped as well.
        if term in small_frame.index or term.startswith('/x/'):
            continue
        nearest_id = annoy_index.get_nns_by_vector(big_frame.loc[term], 1)[0]
        nearest_term = annoy_labels[nearest_id]
        # Score the match with exact cosine similarity (the Annoy lookup
        # itself is only approximate).
        similarity = cosine_similarity(
            get_vector(big_frame, term, lang),
            get_vector(small_frame, nearest_term, lang))
        replacements[term] = [nearest_term, round(similarity, 2)]
        if verbose and len(replacements) % 20 == 0:
            print('{} ==> {}, {}'.format(term, nearest_term, similarity))
    return replacements
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    expected_scores = []
    actual_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        # A VectorSpaceWrapper can score a pair of terms directly; for a
        # plain frame we look up both vectors and compare them ourselves.
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(
                get_vector(vectors, uri1), get_vector(vectors, uri2))

        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' % (term1, term2, gold_score, our_score))
        expected_scores.append(gold_score)
        actual_scores.append(our_score)

    correlation = spearmanr(np.array(expected_scores), np.array(actual_scores))[0]
    if verbose:
        print("Spearman correlation: %s" % (correlation, ))
    # Report the correlation together with its confidence bounds.
    return confidence_interval(correlation, len(expected_scores))
# NOTE(review): this is a duplicate definition of spearman_evaluate — an
# earlier, near-identical definition of the same name exists in this file.
# At import time this later definition shadows the earlier one; the two
# should probably be consolidated into a single copy.
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        # A VectorSpaceWrapper scores the pair itself; a plain frame needs
        # an explicit cosine similarity between the two looked-up vectors.
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(get_vector(vectors, uri1),
                                          get_vector(vectors, uri2))

        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' % (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation,))

    # Return the correlation wrapped with its confidence bounds.
    return confidence_interval(correlation, len(gold_scores))
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    def vocab_correlation_bias(vocab_1, vocab_2):
        # Shared pattern: look up both vocabularies' vectors in the frame
        # and measure how correlated they are.
        return correlation_bias(
            get_vocabulary_vectors(frame, vocab_1),
            get_vocabulary_vectors(frame, vocab_2))

    # Gender: project each stereotyped word pair's difference onto the
    # female-minus-male axis, then summarize with a mean and a ~95%
    # interval (2 standard errors).
    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) -
        get_category_axis(frame, MALE_WORDS))
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        diff = normalize_vec(
            get_vector(frame, female_biased_uri) -
            get_vector(frame, male_biased_uri)).dot(gender_binary_axis)
        gender_bias_numbers.append(diff)

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series(
        [mean, mean - sem * 2, mean + sem * 2],
        index=['bias', 'low', 'high'])

    # Ethnicity and belief biases: correlate vocabularies of people terms
    # against vocabularies of stereotype terms.
    fine_ethnic_bias = vocab_correlation_bias(
        PEOPLE_BY_ETHNICITY, ETHNIC_STEREOTYPE_TERMS)
    coarse_ethnic_bias = vocab_correlation_bias(
        COARSE_ETHNICITY_TERMS, ETHNIC_STEREOTYPE_TERMS)

    # Name-based ethnicity uses a category axis per name set instead of
    # raw vocabulary vectors.
    name_axes = pd.DataFrame(
        np.vstack([get_category_axis(frame, names)
                   for names in ETHNIC_NAME_SETS]))
    name_ethnic_bias = correlation_bias(
        name_axes, get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS))

    belief_bias = vocab_correlation_bias(
        PEOPLE_BY_BELIEF, BELIEF_STEREOTYPE_TERMS)

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T
def test_get_vector(frame=None):
    """
    Check if vectors.get_vector() returns the same vector given labels
    that are shaped in a different way.

    If `frame` is given, it is passed to load_any_embeddings() and that
    embedding set is checked as well; the bundled GloVe vectors are
    always checked.
    """
    # Compare against None explicitly: `if frame:` tests truthiness, which
    # raises "truth value is ambiguous" if a DataFrame is ever passed, and
    # the intent here is only "was an argument supplied".
    if frame is not None:
        vectors = load_any_embeddings(frame)
        ok_(get_vector(vectors, '/c/en/cat').equals(
            get_vector(vectors, 'cat', 'en')))

    vectors = load_any_embeddings(DATA + '/vectors/glove12-840B.h5')
    ok_(get_vector(vectors, '/c/en/cat').equals(
        get_vector(vectors, 'cat', 'en')))
def test_get_vector():
    """A URI label and its (term, language) form must yield the same vector."""
    by_uri = get_vector(TEST_FRAME, '/c/en/cat')
    by_parts = get_vector(TEST_FRAME, 'cat', 'en')
    ok_(by_uri.equals(by_parts))
def test_get_vector(simple_frame):
    """A URI label and its (term, language) form must yield the same vector."""
    by_uri = get_vector(simple_frame, '/c/en/cat')
    by_parts = get_vector(simple_frame, 'cat', 'en')
    assert by_uri.equals(by_parts)
def analogy_func(frame, a1, b1, a2):
    """
    Return the vector for the analogy "a1 is to b1 as a2 is to ___",
    computed as vec(b1) - vec(a1) + vec(a2).
    """
    # The offset from a1 to b1, applied to a2 (same evaluation order as
    # b1 - a1 + a2).
    offset = get_vector(frame, b1) - get_vector(frame, a1)
    return offset + get_vector(frame, a2)