def measure_bias(frame):
    """
    Measure stereotype bias in the semantic space `frame`.

    Returns a DataFrame with one row per measurement:

    - 'gender': how strongly the pairs in GENDER_BIAS_PAIRS line up with
      the female-minus-male axis
    - 'ethnicity-fine': PEOPLE_BY_ETHNICITY against ETHNIC_STEREOTYPE_TERMS
    - 'ethnicity-coarse': COARSE_ETHNICITY_TERMS against
      ETHNIC_STEREOTYPE_TERMS
    - 'ethnicity-names': the category axes of ETHNIC_NAME_SETS against
      ETHNIC_STEREOTYPE_TERMS
    - 'beliefs': PEOPLE_BY_BELIEF against BELIEF_STEREOTYPE_TERMS

    The gender row holds the mean projection ('bias') together with the
    mean minus and plus two standard errors ('low', 'high'). The other
    rows are whatever `correlation_bias` returns for each pair of vector
    sets.
    """
    wrapper = VectorSpaceWrapper(frame=frame)
    wrapper.load()

    # Unit axis pointing from the male word category to the female one.
    female_axis = get_category_axis(frame, FEMALE_WORDS)
    male_axis = get_category_axis(frame, MALE_WORDS)
    gender_axis = normalize_vec(female_axis - male_axis)

    # Project the normalized difference of every word pair onto that axis.
    projections = []
    for female_word, male_word in GENDER_BIAS_PAIRS:
        female_uri = standardized_uri('en', female_word)
        male_uri = standardized_uri('en', male_word)
        pair_direction = normalize_vec(
            wrapper.get_vector(female_uri) - wrapper.get_vector(male_uri)
        )
        projections.append(pair_direction.dot(gender_axis))

    center = np.mean(projections)
    margin = scipy.stats.sem(projections) * 2
    rows = {
        'gender': pd.Series(
            [center, center - margin, center + margin],
            index=['bias', 'low', 'high']
        )
    }

    group_vecs = get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY)
    trait_vecs = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    rows['ethnicity-fine'] = correlation_bias(group_vecs, trait_vecs)

    group_vecs = get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS)
    trait_vecs = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    rows['ethnicity-coarse'] = correlation_bias(group_vecs, trait_vecs)

    # One averaged axis per set of names, stacked into a frame.
    name_axes = [get_category_axis(frame, names) for names in ETHNIC_NAME_SETS]
    group_vecs = pd.DataFrame(np.vstack(name_axes))
    trait_vecs = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    rows['ethnicity-names'] = correlation_bias(group_vecs, trait_vecs)

    group_vecs = get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF)
    trait_vecs = get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS)
    rows['beliefs'] = correlation_bias(group_vecs, trait_vecs)

    # Transpose so each measurement is a row rather than a column.
    return pd.DataFrame(rows).T
def get_vector(self, query, oov_vector=True):
    """
    Given one of the possible types of queries (see `similar_terms`), make
    a vector to look up from it.

    Accepted query types:

    - `np.ndarray`: returned unchanged
    - `pd.Series` or `dict`: its `(key, value)` items become the terms
    - `pd.DataFrame`: its records become the terms
    - `str`: a single term with weight 1
    - `list`: used as the list of terms directly

    Any other type raises ValueError.

    If there are 5 or fewer terms involved and `oov_vector=True`, this
    will allow expanded_vector to use an out-of-vocab strategy to find
    missing terms.

    Results are memoized in `self.cache`, keyed on the terms plus the
    `oov_vector` flag as it was passed in.
    """
    self.load()

    if isinstance(query, np.ndarray):
        return query
    elif isinstance(query, (pd.Series, dict)):
        terms = list(query.items())
    elif isinstance(query, pd.DataFrame):
        terms = list(query.to_records())
    elif isinstance(query, str):
        terms = [(query, 1.)]
    elif isinstance(query, list):
        terms = query
    else:
        raise ValueError("Can't make a query out of type %s" % type(query))

    # The cache key must be hashable. Terms given as lists (and NumPy record
    # scalars, which generally refuse to hash while writeable) are turned
    # into tuples for the key only; `terms` itself is passed on untouched.
    # Terms that are already tuples produce exactly the same key as before.
    cache_key = tuple(
        tuple(term) if isinstance(term, (list, np.void)) else term
        for term in terms
    ) + (oov_vector,)
    if cache_key in self.cache:
        return self.cache[cache_key]

    # The flag is only honored for short queries.
    oov_vector = oov_vector and (len(terms) <= 5)
    vec = self.expanded_vector(terms, oov_vector=oov_vector)
    self.cache[cache_key] = normalize_vec(vec)
    return self.cache[cache_key]
def get_vector(self, query, include_neighbors=True):
    """
    Given one of the possible types of queries (see `similar_terms`), make
    a vector to look up from it.

    Accepted query types:

    - `np.ndarray`: returned unchanged
    - `pd.Series` or `dict`: its `(key, value)` items become the terms
    - `pd.DataFrame`: its records become the terms
    - `str`: a single term with weight 1
    - `list`: used as the list of terms directly

    Any other type raises ValueError.

    If there are 5 or fewer terms involved and `include_neighbors=True`,
    this will allow expanded_vector to look up neighboring terms in
    ConceptNet.

    Results are memoized in `self.cache`, keyed on the terms plus the
    `include_neighbors` flag as it was passed in.
    """
    self.load()

    if isinstance(query, np.ndarray):
        return query
    elif isinstance(query, (pd.Series, dict)):
        terms = list(query.items())
    elif isinstance(query, pd.DataFrame):
        terms = list(query.to_records())
    elif isinstance(query, str):
        terms = [(query, 1.)]
    elif isinstance(query, list):
        terms = query
    else:
        raise ValueError("Can't make a query out of type %s" % type(query))

    # The cache key must be hashable. Terms given as lists (and NumPy record
    # scalars, which generally refuse to hash while writeable) are turned
    # into tuples for the key only; `terms` itself is passed on untouched.
    # Terms that are already tuples produce exactly the same key as before.
    cache_key = tuple(
        tuple(term) if isinstance(term, (list, np.void)) else term
        for term in terms
    ) + (include_neighbors,)
    if cache_key in self.cache:
        return self.cache[cache_key]

    # The flag is only honored for short queries.
    include_neighbors = include_neighbors and (len(terms) <= 5)
    vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
    self.cache[cache_key] = normalize_vec(vec)
    return self.cache[cache_key]
def get_vector(self, query, oov_vector=True):
    """
    Given one of the possible types of queries (see `similar_terms`), make
    a vector to look up from it.

    Accepted query types:

    - `np.ndarray`: returned unchanged
    - `pd.Series` or `dict`: its `(key, value)` items become the terms
    - `pd.DataFrame`: its records become the terms
    - `str`: a single term with weight 1
    - `list`: used as the list of terms directly

    Any other type raises ValueError.

    If there are 5 or fewer terms involved and `oov_vector=True`, the flag
    is passed on to expanded_vector as True; otherwise expanded_vector
    receives a false value. What expanded_vector does with the flag is
    defined there (presumably a fallback for terms that are missing from
    the vocabulary, going by the parameter's name).

    Results are memoized in `self.cache`, keyed on the terms plus the
    `oov_vector` flag as it was passed in.
    """
    self.load()

    if isinstance(query, np.ndarray):
        return query
    elif isinstance(query, (pd.Series, dict)):
        terms = list(query.items())
    elif isinstance(query, pd.DataFrame):
        terms = list(query.to_records())
    elif isinstance(query, str):
        terms = [(query, 1.)]
    elif isinstance(query, list):
        terms = query
    else:
        raise ValueError("Can't make a query out of type %s" % type(query))

    # The cache key must be hashable. Terms given as lists (and NumPy record
    # scalars, which generally refuse to hash while writeable) are turned
    # into tuples for the key only; `terms` itself is passed on untouched.
    # Terms that are already tuples produce exactly the same key as before.
    cache_key = tuple(
        tuple(term) if isinstance(term, (list, np.void)) else term
        for term in terms
    ) + (oov_vector,)
    if cache_key in self.cache:
        return self.cache[cache_key]

    # The flag is only honored for short queries.
    oov_vector = oov_vector and (len(terms) <= 5)
    vec = self.expanded_vector(terms, oov_vector=oov_vector)
    self.cache[cache_key] = normalize_vec(vec)
    return self.cache[cache_key]
def get_weighted_vector(frame, weighted_terms):
    """
    Combine the rows of `frame` named by a list of (term, weight) pairs
    into one vector: each known term's row is scaled by its weight, the
    scaled rows are summed, and the sum is passed through `normalize_vec`.

    Terms that are not in `frame.index` are skipped. This is a simplified
    version of VectorSpaceWrapper.get_vector().
    """
    # A zeroed copy of the first row gives an accumulator of the right shape.
    accumulator = frame.iloc[0] * 0.
    for term, weight in weighted_terms:
        if term not in frame.index:
            continue
        accumulator += frame.loc[term] * weight
    return normalize_vec(accumulator)
def reject_subspace(frame, vecs):
    """
    Remove from every row of the vector space `frame` its component along
    each vector in `vecs`, so that the rows no longer correlate with those
    vectors.

    For each vector in turn, the vector is normalized, every row is
    projected onto it, and the outer product of the projections with the
    vector is subtracted. The remaining rows are then passed through
    `l2_normalize_rows` and returned; `frame` itself is not modified.
    """
    remainder = frame.copy()
    for direction in vecs:
        unit = normalize_vec(direction)
        # One projection coefficient per row of the current remainder.
        coefficients = remainder.dot(unit)
        remainder -= np.outer(coefficients, unit)

    return l2_normalize_rows(remainder, offset=1e-9)
def get_weighted_vector(frame, weighted_terms):
    """
    Turn a list of (term, weight) pairs into a single vector by summing
    the weighted rows of `frame` for those terms and handing the sum to
    `normalize_vec`.

    Pairs whose term does not appear in `frame.index` contribute nothing.
    A simplified version of VectorSpaceWrapper.get_vector().
    """
    # Start from zeros shaped like a row of the frame.
    weighted_sum = frame.iloc[0] * 0.
    vocabulary = frame.index
    for term, weight in weighted_terms:
        if term in vocabulary:
            row = frame.loc[term]
            weighted_sum += row * weight

    return normalize_vec(weighted_sum)
def reject_subspace(frame, vecs):
    """
    Return a modification of the vector space `frame` where none of its rows
    have any correlation with any rows of `vecs`, by subtracting the outer
    product of `frame` with each normalized row of `vecs`.

    The result is a new DataFrame with the same index as `frame`, whose
    rows have been L2-normalized and whose NaN entries have been replaced
    by 0. `frame` itself is left unmodified.
    """
    # Work on an explicit copy of the data. np.array() always produces a
    # fresh, writable array, whereas `frame.copy().values` may be a read-only
    # view when pandas Copy-on-Write is active, which would make the in-place
    # subtraction below fail.
    current_array = np.array(frame.values)
    for vec in vecs:
        vec = normalize_vec(vec)
        projection = current_array.dot(vec)
        # Subtract each row's component along `vec` without allocating a
        # second full-size array for the result.
        np.subtract(current_array, np.outer(projection, vec), out=current_array)

    # Use the returned array rather than relying on the in-place side effect:
    # with copy=False the normalization is in place only when the input
    # needs no conversion (presumably this is sklearn.preprocessing.normalize).
    current_array = normalize(current_array, norm='l2', copy=False)

    result = pd.DataFrame(current_array, index=frame.index)
    result.fillna(0, inplace=True)
    return result
def reject_subspace(frame, vecs):
    """
    Return a modification of the vector space `frame` where none of its rows
    have any correlation with any rows of `vecs`, by subtracting the outer
    product of `frame` with each normalized row of `vecs`.

    The result is a new DataFrame with the same index as `frame`, whose
    rows have been L2-normalized and whose NaN entries have been replaced
    by 0. `frame` itself is left unmodified.
    """
    # Work on an explicit copy of the data. np.array() always produces a
    # fresh, writable array, whereas `frame.copy().values` may be a read-only
    # view when pandas Copy-on-Write is active, which would make the in-place
    # subtraction below fail.
    current_array = np.array(frame.values)
    for vec in vecs:
        vec = normalize_vec(vec)
        projection = current_array.dot(vec)
        # Subtract each row's component along `vec` without allocating a
        # second full-size array for the result.
        np.subtract(current_array, np.outer(projection, vec), out=current_array)

    # Use the returned array rather than relying on the in-place side effect:
    # with copy=False the normalization is in place only when the input
    # needs no conversion (presumably this is sklearn.preprocessing.normalize).
    current_array = normalize(current_array, norm='l2', copy=False)

    result = pd.DataFrame(current_array, index=frame.index)
    result.fillna(0, inplace=True)
    return result
def get_vector(self, query, include_neighbors=True):
    """
    Given one of the possible types of queries (see `similar_terms`), make
    a vector to look up from it.

    Accepted query types:

    - `pd.Series` or `dict`: its `(key, value)` items become the terms
    - `pd.DataFrame`: its records become the terms
    - `str`: a single term with weight 1
    - `list`: used as the list of terms directly

    Any other type raises ValueError.

    If there are 5 or fewer terms involved and `include_neighbors=True`,
    this will allow expanded_vector to look up neighboring terms in
    ConceptNet.

    The vector produced by expanded_vector is passed through
    `normalize_vec` before being returned.
    """
    self.load()

    # A Series maps term -> weight, so its items are (term, weight) pairs.
    # A DataFrame's items() would instead yield (column name, column) pairs,
    # so a DataFrame is read record by record.
    if isinstance(query, (pd.Series, dict)):
        terms = list(query.items())
    elif isinstance(query, pd.DataFrame):
        terms = list(query.to_records())
    elif isinstance(query, str):
        terms = [(query, 1.0)]
    elif isinstance(query, list):
        terms = query
    else:
        raise ValueError("Can't make a query out of type %s" % type(query))

    # The flag is only honored for short queries.
    include_neighbors = include_neighbors and (len(terms) <= 5)
    vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
    return normalize_vec(vec)
def get_vector(self, query, include_neighbors=True):
    """
    Given one of the possible types of queries (see `similar_terms`), make
    a vector to look up from it.

    Accepted query types:

    - `pd.Series` or `dict`: its `(key, value)` items become the terms
    - `pd.DataFrame`: its records become the terms
    - `str`: a single term with weight 1
    - `list`: used as the list of terms directly

    Any other type raises ValueError.

    If there are 5 or fewer terms involved and `include_neighbors=True`,
    this will allow expanded_vector to look up neighboring terms in
    ConceptNet.

    The vector produced by expanded_vector is passed through
    `normalize_vec` before being returned.
    """
    self.load()

    # A Series maps term -> weight, so its items are (term, weight) pairs.
    # A DataFrame's items() would instead yield (column name, column) pairs,
    # so a DataFrame is read record by record.
    if isinstance(query, (pd.Series, dict)):
        terms = list(query.items())
    elif isinstance(query, pd.DataFrame):
        terms = list(query.to_records())
    elif isinstance(query, str):
        terms = [(query, 1.)]
    elif isinstance(query, list):
        terms = query
    else:
        raise ValueError("Can't make a query out of type %s" % type(query))

    # The flag is only honored for short queries.
    include_neighbors = include_neighbors and (len(terms) <= 5)
    vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
    return normalize_vec(vec)