Example #1
def skip_assertion(source_dict, start, end):
    """
    Filter out assertions that we can tell will be unhelpful after we've
    extracted them.
    """
    if lemmatize_uri(start) in CONCEPT_BLACKLIST or lemmatize_uri(end) in CONCEPT_BLACKLIST:
        return True
    if source_dict['contributor'] in CONTRIBUTOR_BLACKLIST:
        return True
    if 'bedume' in source_dict['contributor']:
        for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
            check = '/' + flagged.replace(' ', '_')
            if start.endswith(check) or end.endswith(check):
                return True
    return False
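
A minimal usage sketch: the constants and lemmatize_uri below are stand-ins
for the real definitions elsewhere in the ConceptNet codebase, with
hypothetical values chosen only to exercise each branch.

CONCEPT_BLACKLIST = {'/c/en/censored'}
CONTRIBUTOR_BLACKLIST = {'/s/contributor/spammer'}
BEDUME_FLAGGED_CONCEPTS = ['new york']
BEDUME_FLAGGED_PLACES = ['oklahoma']

def lemmatize_uri(uri):
    return uri  # stand-in: the real function reduces a URI to its root form

# The end term matches a flagged place, and the contributor is 'bedume':
source = {'contributor': '/s/contributor/omcs/bedume'}
print(skip_assertion(source, '/c/en/cat', '/c/en/oklahoma'))  # True

# An ordinary assertion from another contributor is kept:
source = {'contributor': '/s/contributor/omcs/someone'}
print(skip_assertion(source, '/c/en/cat', '/c/en/animal'))  # False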
Example #2
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms to one whose row
    labels are standardized ConceptNet URIs (with some extra word2vec-style
    normalization of digits). Rows whose labels get the same
    standardized URI get combined, with earlier rows given more weight.
    """
    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [uri_prefix(standardized_uri(language, label)) for label in frame.index]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
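
A sketch of the combining behavior, with trivial stand-ins for the URI
helpers (the real standardized_uri, uri_prefix and lemmatize_uri come from
the ConceptNet codebase and do more normalization than this):

import numpy as np
import pandas as pd

def standardized_uri(language, term):
    # Stand-in: lowercase the term and build a ConceptNet-style URI.
    return '/c/%s/%s' % (language, term.lower().replace(' ', '_'))

def uri_prefix(uri):
    return uri  # stand-in: the real function trims a URI to a prefix

def lemmatize_uri(uri):
    return uri  # stand-in: identity, so the `forms` step is a no-op here

frame = pd.DataFrame([[1.0, 0.0], [0.0, 1.0], [3.0, 3.0]],
                     index=['Cat', 'cat', 'dog'])
print(standardize_row_labels(frame))
# 'Cat' (weight 1) and 'cat' (weight 1/2) both standardize to /c/en/cat and
# get the weighted average (1*[1, 0] + 0.5*[0, 1]) / 1.5 = [0.667, 0.333];
# /c/en/dog keeps its original vector.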
Example #3
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term', with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized
    URI get combined, with earlier rows given more weight.
    Args:
        frame (DataFrame): Term-vector DataFrame indexed by terms.
        language (str, default='en'): Use this language for labels that aren't
            already standardized.
        forms (bool, default=True): Combine terms with the same lemma.
    """
    # Check whether the labels are in the bare 'en/term' format. (The
    # previous check, `'/' in label`, was a bug: it also matched labels
    # that were already standardized URIs like '/c/en/term'.)
    if all(label.count('/') == 1 for label in frame.index[10:20]):
        # A bare label like 'en/term' partitions into ('en', '/', 'term')
        # and standardizes to '/c/en/term'; the old check let '/c/en/term'
        # through, where partitioning into ('', '/', 'c/en/term') produced
        # a mangled label like '/c//en_term'.
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels; the
    # `language` argument is only used for labels that aren't already standardized
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)  # "with earlier rows given more weight"
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
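
To make the comments above concrete, str.partition splits on the first '/',
which is what distinguishes the two label formats:

'en/term'.partition('/')     # ('en', '/', 'term'): bare language/term pair
'/c/en/term'.partition('/')  # ('', '/', 'c/en/term'): already a URI
'/c/en/term'.count('/')      # 3, so the stricter count('/') == 1 check skips it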
Example #4
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term', with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized
    URI get combined, with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
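
The `forms` step in isolation, as a sketch that assumes a lemmatizer mapping
/c/en/cats to /c/en/cat (a stand-in for what lemmatize_uri does to a plural):

import pandas as pd

relabeled = pd.DataFrame([[1.0, 0.0], [0.0, 1.0]],
                         index=['/c/en/cat', '/c/en/cats'])
combined_weights = pd.Series([1.0, 0.5], index=['/c/en/cat', '/c/en/cats'])

# '/c/en/cats' lemmatizes to '/c/en/cat', which is present in the index,
# so half of the plural's vector and half of its weight flow into the lemma:
relabeled.loc['/c/en/cat'] += relabeled.loc['/c/en/cats'] / 2
combined_weights.loc['/c/en/cat'] += combined_weights.loc['/c/en/cats'] / 2
# relabeled.loc['/c/en/cat'] is now [1.0, 0.5], with weight 1.25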