Example #1
def _rename_or_drop_tid_columns(target):
    LOGGER.info(
        "Renaming '%s' column with no null values to '%s' "
        "& dropping '%s' columns with null values ...",
        keys.CATALOG_ID,
        keys.TID,
        keys.CATALOG_ID,
    )

    # If `catalog_id` is one column (i.e., a `Series`),
    # then it won't have None values
    if isinstance(target[keys.CATALOG_ID], pd.Series):
        target[keys.TID] = target[keys.CATALOG_ID]

    else:
        no_nulls = target[keys.CATALOG_ID].dropna(axis=1)
        # It may happen that more than 1 column has no null values:
        # in this case, they must be identical,
        # so take the first one
        target[keys.TID] = (
            no_nulls.iloc[:, 0]
            if isinstance(no_nulls, pd.DataFrame)
            else no_nulls
        )

    target.drop(columns=keys.CATALOG_ID, inplace=True)

    log_dataframe_info(
        LOGGER,
        target,
        f"Renamed '{keys.CATALOG_ID}' column with no null values to "
        f"'{keys.TID}' & dropped '{keys.CATALOG_ID}' columns with null values",
    )
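
For illustration, here is a minimal, self-contained sketch of the same selection logic on toy data. The literal 'catalog_id' and 'tid' names stand in for the module's keys constants, and the values are made up; none of this comes from the original module.

import pandas as pd

# Toy frame with two duplicate 'catalog_id' columns: the first has no nulls,
# the second does (hypothetical data)
toy = pd.DataFrame(
    [['n123', None], ['n456', 'n456']],
    columns=['catalog_id', 'catalog_id'],
)

selected = toy['catalog_id']        # duplicate labels, so this is a DataFrame
no_nulls = selected.dropna(axis=1)  # keep only the null-free duplicate(s)
toy['tid'] = (
    no_nulls.iloc[:, 0] if isinstance(no_nulls, pd.DataFrame) else no_nulls
)
toy = toy.drop(columns='catalog_id')
print(toy)  # a single 'tid' column: ['n123', 'n456']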
Example #2
def _handle_dates(df, column):
    # Datasets are hitting pandas timestamp limitations, see
    # http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
    # Parse into Period instead, see
    # http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-oob
    if df.get(column) is None:
        LOGGER.warning(
            "No '%s' column in DataFrame, won't handle its dates. Perhaps it was dropped because it contained null values only",
            column,
        )

    df[column] = df[column].map(_parse_dates_list, na_action='ignore')

    log_dataframe_info(LOGGER, df, 'Parsed dates')
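
The actual `_parse_dates_list` helper is not shown above; the following is only a rough sketch of the Period-based parsing the comments point to, using a hypothetical `parse_date` function and toy date strings.

import pandas as pd

def parse_date(value):
    # Hypothetical stand-in for the `_parse_dates_list` helper, which is not
    # shown here: pandas Timestamps only cover roughly 1677-2262, so older
    # dates would raise OutOfBoundsDatetime; Period has no such limit
    return pd.Period(value, freq='D')

dates = pd.Series(['1472-03-25', '1980-07-01', None])
print(dates.map(parse_date, na_action='ignore'))  # None is left untouched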
Example #3
def _pair_dates(target):
    if _will_handle_birth_dates(target):
        LOGGER.info('Pairing birth date columns with precision ones ...')

        target[keys.DATE_OF_BIRTH] = list(
            zip(target[keys.DATE_OF_BIRTH], target[keys.BIRTH_PRECISION]))
        target.drop(columns=[keys.BIRTH_PRECISION], inplace=True)

        log_dataframe_info(LOGGER, target,
                           'Paired birth date columns with precision ones')

    if _will_handle_death_dates(target):
        LOGGER.info('Pairing death date columns with precision ones ...')

        target[keys.DATE_OF_DEATH] = list(
            zip(target[keys.DATE_OF_DEATH], target[keys.DEATH_PRECISION]))
        target.drop(columns=[keys.DEATH_PRECISION], inplace=True)

        log_dataframe_info(LOGGER, target,
                           'Paired death date columns with precision ones')
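
As a toy illustration of the pairing step, with hypothetical 'born'/'born_precision' columns standing in for the module's keys constants:

import pandas as pd

# Toy frame: a date column plus its Wikidata-style precision (9 = year, 11 = day)
toy = pd.DataFrame({
    'born': ['1564-04-23', '1926-01-01'],
    'born_precision': [11, 9],
})

# Pair each date with its precision, then drop the now-redundant precision column
toy['born'] = list(zip(toy['born'], toy['born_precision']))
toy = toy.drop(columns=['born_precision'])
print(toy)  # 'born' now holds (date, precision) tuples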
Example #4
def _drop_null_columns(target):
    target.dropna(axis=1, how='all', inplace=True)
    log_dataframe_info(LOGGER, target, 'Dropped columns with null values only')
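
A quick toy demonstration of why how='all' matters here: only columns that are null in every row are removed, while partially filled ones survive. Column names and values are made up.

import pandas as pd

toy = pd.DataFrame({
    'name': ['alice', 'bob'],
    'alias': [None, 'bobby'],   # partially null: kept
    'notes': [None, None],      # entirely null: dropped
})

toy.dropna(axis=1, how='all', inplace=True)
print(toy.columns.tolist())  # ['name', 'alias']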
Example #5
def preprocess_target(goal: str,
                      target_reader: Iterator[pd.DataFrame]) -> pd.DataFrame:
    """Preprocess a target catalog dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_target`.

    **Preprocessing actions:**

    1. drop unneeded columns holding target DB primary keys
    2. rename non-null catalog ID columns & drop others
    3. drop columns with null values only
    4. pair dates with their precision and drop precision columns
       when applicable
    5. aggregate denormalized data on target ID
    6. *(shared with* :func:`preprocess_wikidata` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param target_reader: a dataset reader as returned by
      :func:`build_target`
    :return: the preprocessed target dataset as a single
      :class:`pandas.DataFrame`
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing target ...')

    # Target data is denormalized, so we must consume the input generator
    # to perform consistent aggregations later
    target = pd.concat([chunk for chunk in target_reader], sort=False)

    # 1. Drop target DB internal ID columns
    LOGGER.info("Dropping '%s' columns ...", keys.INTERNAL_ID)
    target.drop(columns=keys.INTERNAL_ID, inplace=True)
    log_dataframe_info(LOGGER, target,
                       f"Dropped '{keys.INTERNAL_ID}'' columns")

    # 2. Rename non-null catalog ID column & drop others
    _rename_or_drop_tid_columns(target)

    # 3. Drop columns with null values only
    LOGGER.info('Dropping columns with null values only ...')
    _drop_null_columns(target)

    # 4. Pair dates with their precision & drop precision columns
    _pair_dates(target)

    # 5. Aggregate denormalized data on target ID
    # TODO Token lists may contain duplicate tokens
    LOGGER.info("Aggregating denormalized data on '%s' column ...", keys.TID)
    target = target.groupby(keys.TID).agg(lambda x: list(set(x)))
    log_dataframe_info(LOGGER, target,
                       f"Data indexed and aggregated on '{keys.TID}' column")

    # 6. Shared preprocessing
    target = _shared_preprocessing(
        target,
        _will_handle_birth_dates(target),
        _will_handle_death_dates(target),
    )

    LOGGER.info('Target preprocessing done')

    return target
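
To make the step-5 aggregation concrete, here is a standalone toy sketch (hypothetical 'tid', 'name' and 'url' columns, not the module's keys constants) of grouping denormalized rows on the target ID and collapsing duplicates into per-cell lists.

import pandas as pd

# Toy denormalized target data: one row per (tid, value) pair
toy = pd.DataFrame({
    'tid': ['n1', 'n1', 'n2'],
    'name': ['John Doe', 'J. Doe', 'Jane Roe'],
    'url': ['http://a.example', 'http://a.example', 'http://b.example'],
})

# Same pattern as step 5: index on the target ID and deduplicate each cell
aggregated = toy.groupby('tid').agg(lambda x: list(set(x)))
print(aggregated)
# 'n1' -> name ['John Doe', 'J. Doe'] (order may vary), url ['http://a.example']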
Example #6
def preprocess_wikidata(goal: str,
                        wikidata_reader: JsonReader) -> Iterator[pd.DataFrame]:
    """Preprocess a Wikidata dataset: workflow step 2.

    This function consumes :class:`pandas.DataFrame` chunks and
    should be pipelined after :func:`build_wikidata`.

    **Preprocessing actions:**

    1. set QIDs as :class:`pandas.core.indexes.base.Index` of the chunk
    2. drop columns with null values only
    3. *(training)* ensure one target ID per QID
    4. tokenize names, URLs, genres, when applicable
    5. *(shared with* :func:`preprocess_target` *)*
       normalize columns with names, occupations, dates, when applicable

    :param goal: ``{'training', 'classification'}``.
      Whether the dataset is for training or classification
    :param wikidata_reader: a dataset reader as returned by
      :func:`build_wikidata`
    :return: the generator yielding preprocessed
      :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Preprocessing Wikidata %s set ...', goal)

    for i, chunk in enumerate(wikidata_reader, 1):
        # 1. QID as index
        chunk.set_index(keys.QID, inplace=True)
        log_dataframe_info(LOGGER, chunk,
                           f"Built index from '{keys.QID}' column")

        # 2. Drop columns with null values only
        _drop_null_columns(chunk)

        # 3. Training only: ensure 1 target ID
        if goal == 'training':
            # This wipes out QIDs with more than 1 positive sample,
            # but the impact is negligible
            chunk[keys.TID] = chunk[keys.TID].map(
                lambda cell: cell[0] if isinstance(cell, list) else cell)

        # 4a. Tokenize names
        for column in constants.NAME_FIELDS:
            if chunk.get(column) is not None:
                chunk[f'{column}_tokens'] = chunk[column].apply(
                    _tokenize_values, args=(text_utils.tokenize, ))

        # 4b. Tokenize genres if available
        if chunk.get(keys.GENRES) is not None:
            chunk[keys.GENRES] = chunk[keys.GENRES].apply(
                _tokenize_values, args=(text_utils.tokenize, ))

        # 4c. Tokenize URLs
        chunk[keys.URL_TOKENS] = chunk[keys.URL].apply(
            _tokenize_values, args=(url_utils.tokenize, ))

        # 5. Shared preprocessing
        chunk = _shared_preprocessing(
            chunk,
            _will_handle_birth_dates(chunk),
            _will_handle_death_dates(chunk),
        )

        LOGGER.info('Chunk %d done', i)

        yield chunk
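
To make step 3 concrete, here is a standalone toy sketch of the training-only flattening, with hypothetical QIDs and target IDs:

import pandas as pd

# Toy 'tid' column as it may look before preprocessing: some QIDs carry
# more than one target ID (a list), others a single string
tids = pd.Series({'Q1': ['n1', 'n2'], 'Q2': 'n3'})

# Same flattening as step 3: keep only the first target ID per QID
flattened = tids.map(lambda cell: cell[0] if isinstance(cell, list) else cell)
print(flattened)  # Q1 -> 'n1', Q2 -> 'n3'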