def skip_assertion(source_dict, start, end):
    """
    Filter out assertions that we can tell will be unhelpful after we've
    extracted them.
    """
    if (
        lemmatize_uri(start) in CONCEPT_BLACKLIST
        or lemmatize_uri(end) in CONCEPT_BLACKLIST
    ):
        return True
    if source_dict['contributor'] in CONTRIBUTOR_BLACKLIST:
        return True
    if 'bedume' in source_dict['contributor']:
        for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
            check = '/' + flagged.replace(' ', '_')
            if start.endswith(check) or end.endswith(check):
                return True
    return False
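# A minimal, self-contained sketch (separate from the function above) of the
# suffix check that skip_assertion applies to the BEDUME_FLAGGED_* lists: a
# flagged phrase such as 'new york' becomes the URI suffix '/new_york', which
# is matched against the ends of the start and end URIs. The helper name and
# the phrases below are hypothetical examples, not values from the real
# blacklists.
def _matches_flagged(uri, flagged_phrases):
    for flagged in flagged_phrases:
        check = '/' + flagged.replace(' ', '_')
        if uri.endswith(check):
            return True
    return False

assert _matches_flagged('/c/en/new_york', ['new york'])        # suffix matches
assert not _matches_flagged('/c/en/new_yorker', ['new york'])  # no match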
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms to one whose row
    labels are standardized ConceptNet URIs (with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized URI
    get combined, with earlier rows given more weight.
    """
    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
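# A self-contained sketch (toy data, made-up labels) of the weighted-average
# step used above: rows are weighted by 1/(n+1), rows with the same label are
# summed with groupby(level=0), and the sums are divided by the combined
# weights to get a weighted average.
import numpy as np
import pandas as pd

toy = pd.DataFrame(
    [[1.0, 0.0], [0.0, 1.0], [3.0, 3.0]],
    index=['/c/en/cat', '/c/en/dog', '/c/en/cat'],  # duplicate label
)
w = 1.0 / np.arange(1, len(toy) + 1)                # [1, 1/2, 1/3]
label_w = pd.Series(w, index=toy.index)

summed = toy.mul(w, axis='index').sort_index().groupby(level=0).sum()
combined_w = label_w.sort_index().groupby(level=0).sum()
averaged = summed.div(combined_w, axis='index')

# /c/en/cat is the weighted average of rows 0 and 2:
# (1.0 * [1, 0] + (1/3) * [3, 3]) / (1 + 1/3) == [1.5, 0.75]
print(averaged)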
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the form
    'en/term') to one whose row labels are standardized ConceptNet URIs (e.g.
    of the form '/c/en/term'; and with some extra word2vec-style normalization
    of digits). Rows whose labels get the same standardized URI get combined,
    with earlier rows given more weight.

    Args:
        frame (DataFrame): Term vectors DataFrame indexed by terms.
        language (str, default='en'): Use this language for labels that
            aren't already standardized.
        forms (bool, default=True): Combine terms with the same lemma.
    """
    # Re-label the DataFrame with standardized, non-unique row labels.
    # The old check here was a bug:
    #     if all('/' in label for label in frame.index[10:20]):
    # It also matched labels that were already ConceptNet URIs, such as
    # '/c/en/term', which partition('/') splits into ('', '/', 'c/en/term')
    # and which then got mangled into labels like '/c//en_term'. The new check
    # only matches labels of the form 'en/term', which partition('/') splits
    # into ('en', '/', 'term') and which become '/c/en/term'.
    if all(label.count('/') == 1 for label in frame.index[10:20]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # The `language` argument is only used here, for labels that aren't
    # already standardized
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging, so earlier rows
    # are given more weight
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
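# A short, self-contained check of the partition behavior described in the
# comments above: labels that are already ConceptNet URIs start with '/', so
# partition('/') yields an empty language field, while 'en/term' labels split
# cleanly into a language code and a term.
assert '/c/en/term'.partition('/') == ('', '/', 'c/en/term')
assert '/c/en/term'.count('/') == 3   # rejected by the corrected check
assert 'en/term'.partition('/') == ('en', '/', 'term')
assert 'en/term'.count('/') == 1      # accepted by the corrected check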
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the form
    'en/term') to one whose row labels are standardized ConceptNet URIs (e.g.
    of the form '/c/en/term'; and with some extra word2vec-style normalization
    of digits). Rows whose labels get the same standardized URI get combined,
    with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
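# A hedged usage sketch for standardize_row_labels. It assumes numpy (np),
# pandas (pd), and the ConceptNet helpers uri_prefix and standardized_uri are
# in scope, as in the module above; the toy labels and vectors are made up for
# illustration, and forms=False is used so no lemmatizer is needed.
vectors = pd.DataFrame(
    np.eye(2),
    index=['en/cat', 'en/dog'],  # 'en/term' style, as in the fastText input
)
standardized = standardize_row_labels(vectors, language='en', forms=False)
# Row labels become ConceptNet URIs such as '/c/en/cat' and '/c/en/dog',
# kept in descending order of their original row weight.
print(standardized.index.tolist())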