Example #1
# Imports assumed from the soweego project layout
from soweego.commons import keys
from soweego.linker import features


def _add_name_tokens_features(feature_extractor):
    name_tokens_column = keys.NAME_TOKENS

    # Levenshtein distance on name tokens
    feature_extractor.add(
        features.SimilarStrings(
            name_tokens_column,
            name_tokens_column,
            label=f'{name_tokens_column}_levenshtein',
        ))

    # String kernel similarity on name tokens
    feature_extractor.add(
        features.SimilarStrings(
            name_tokens_column,
            name_tokens_column,
            algorithm='cosine',
            analyzer='char_wb',
            label=f'{name_tokens_column}_string_kernel_cosine',
        ))

    # Weighted intersection of name tokens
    feature_extractor.add(
        features.SharedTokens(
            name_tokens_column,
            name_tokens_column,
            label=f'{name_tokens_column}_shared',
        ))
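The helper above registers soweego's custom comparator classes. As a rough illustration of what those comparators compute, here is a minimal sketch using the stock `recordlinkage` string methods instead of soweego's subclasses; the toy records, identifiers, and `name_tokens` values are made up for illustration:

import pandas as pd
import recordlinkage as rl

left = pd.DataFrame({'name_tokens': ['john winston lennon']}, index=['Q1203'])
right = pd.DataFrame({'name_tokens': ['john lennon']}, index=['42'])
pairs = pd.MultiIndex.from_product([left.index, right.index])

compare = rl.Compare()
# Normalized Levenshtein similarity, in [0, 1]
compare.string('name_tokens', 'name_tokens',
               method='levenshtein', label='name_tokens_levenshtein')
# Cosine similarity over character q-grams, close in spirit to the
# `char_wb` string kernel feature above
compare.string('name_tokens', 'name_tokens',
               method='cosine', label='name_tokens_cosine')

print(compare.compute(pairs, left, right))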
Example #2
# Imports assumed from the soweego project layout;
# `_add_url_features` and `_add_date_features` are sibling helpers
# not shown in this excerpt
import logging
import os
from multiprocessing import cpu_count

import pandas as pd
import recordlinkage as rl

from soweego.commons import keys
from soweego.linker import features

LOGGER = logging.getLogger(__name__)


def extract_features(
    candidate_pairs: pd.MultiIndex,
    wikidata: pd.DataFrame,
    target: pd.DataFrame,
    path_io: str,
) -> pd.DataFrame:
    """Extract feature vectors by comparing pairs of
    *(Wikidata, target catalog)* records.

    **Main features:**

    - exact match on full names and URLs
    - match on tokenized names, URLs, and genres
    - `Levenshtein distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_
      on name tokens
    - `string kernel <https://en.wikipedia.org/wiki/String_kernel>`_
      similarity on name tokens
    - weighted intersection on name tokens
    - match on dates by maximum shared precision
    - `cosine similarity <https://en.wikipedia.org/wiki/Cosine_similarity>`_
      on textual descriptions
    - match on occupation QIDs

    See :mod:`features` for more details.

    This function parallelizes the comparison over all available CPU cores.

    :param candidate_pairs: an index of *(QID, target ID)* pairs
      that should undergo comparison
    :param wikidata: a preprocessed Wikidata dataset (typically a chunk)
    :param target: a preprocessed target catalog dataset (typically a chunk)
    :param path_io: input/output path to an extracted feature file
    :return: the feature vectors dataset
    """
    LOGGER.info('Extracting features ...')

    # Return cached features early, for development purposes
    if os.path.isfile(path_io):
        LOGGER.info("Will reuse existing features: '%s'", path_io)
        return pd.read_pickle(path_io)

    def in_both_datasets(col: str) -> bool:
        return (col in wikidata.columns) and (col in target.columns)

    feature_extractor = rl.Compare(n_jobs=cpu_count())

    # Exact match on full name
    name_column = keys.NAME
    if in_both_datasets(name_column):
        feature_extractor.add(
            features.ExactMatch(name_column,
                                name_column,
                                label=f'{name_column}_exact'))

    # URL features
    if in_both_datasets(keys.URL):
        _add_url_features(feature_extractor)

    # Date features
    _add_date_features(feature_extractor, in_both_datasets)

    # Name tokens features
    if in_both_datasets(keys.NAME_TOKENS):
        _add_name_tokens_features(feature_extractor)

    # Cosine similarity on description
    description_column = keys.DESCRIPTION
    if in_both_datasets(description_column):
        feature_extractor.add(
            features.SimilarStrings(
                description_column,
                description_column,
                algorithm='cosine',
                analyzer='soweego',
                label=f'{description_column}_cosine',
            ))

    # Match on occupation QIDs
    occupations_column = keys.OCCUPATIONS
    if in_both_datasets(occupations_column):
        feature_extractor.add(
            features.SharedOccupations(
                occupations_column,
                occupations_column,
                label=f'{occupations_column}_shared',
            ))

    # Match on tokenized genres
    genres_column = keys.GENRES
    if in_both_datasets(genres_column):
        feature_extractor.add(
            features.SharedTokens(
                genres_column,
                genres_column,
                label=f'{genres_column}_tokens_shared',
            ))

    feature_vectors = feature_extractor.compute(
        candidate_pairs, wikidata, target)
    # Drop pairs with duplicated indices
    feature_vectors = feature_vectors[~feature_vectors.index.duplicated()]

    os.makedirs(os.path.dirname(path_io), exist_ok=True)
    pd.to_pickle(feature_vectors, path_io)
    LOGGER.info("Features dumped to '%s'", path_io)

    LOGGER.info('Feature extraction done')

    return feature_vectors
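A hypothetical call, assuming the full soweego module is available (including the URL and date helpers not shown above) and that `keys.NAME` resolves to 'name'; the records and output path are made up:

# Toy preprocessed datasets, indexed by QID and target ID respectively
wikidata = pd.DataFrame({'name': ['the beatles']}, index=['Q1299'])
target = pd.DataFrame({'name': ['the beatles']}, index=['777'])
candidate_pairs = pd.MultiIndex.from_product(
    [wikidata.index, target.index], names=['qid', 'tid'])

vectors = extract_features(
    candidate_pairs, wikidata, target, '/tmp/features/chunk0.pkl')
print(vectors)  # one row per candidate pair, one column per feature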