Пример #1
0
def load_wikIR(collection_path):
    """Load the training, validation and test splits of a wikIR collection.

    For each of the ``training``, ``validation`` and ``test`` sub-directories
    of ``collection_path`` this reads ``queries.csv`` (indexed by its
    ``id_left`` column) and ``BM25.qrels.csv`` (indexed by its first column),
    and combines them with ``documents.csv`` (indexed by its ``id_right``
    column) into one ``mz.DataPack`` per split.

    :param collection_path: Path of the collection directory, as a string
        that is joined to the file names with ``/``.
    :return: A tuple ``(train_raw, validation_raw, test_raw)`` of
        ``mz.DataPack`` objects.
    """
    # The document table is the same file for every split, so parse it once
    # instead of once per split.
    documents = pd.read_csv(collection_path + '/documents.csv',
                            index_col='id_right')

    packs = []
    for split in ('training', 'validation', 'test'):
        split_dir = collection_path + '/' + split
        queries = pd.read_csv(split_dir + '/queries.csv',
                              index_col='id_left')
        qrels = pd.read_csv(split_dir + '/BM25.qrels.csv',
                            index_col=0)
        # Each pack gets its own copy of the documents so that modifying the
        # right-hand table of one pack cannot leak into the other packs.
        packs.append(mz.DataPack(left=queries,
                                 right=documents.copy(),
                                 relation=qrels))

    train_raw, validation_raw, test_raw = packs
    return train_raw, validation_raw, test_raw
Пример #2
0
def pack(df: pd.DataFrame, selected_columns_left: List[str],
         selected_columns_right: List[str]) -> 'matchzoo.DataPack':
    """
    Pack a :class:`DataPack` using `df` and explicitly selected columns.

    This is a customized variant of the packing function, kept separate to
    avoid overriding code of the Learning to Rank project.

    The `df` must have `text_left` and `text_right` columns. Optionally,
    the `df` can have `id_left`, `id_right` to index `text_left` and
    `text_right` respectively. `id_left`, `id_right` will be automatically
    generated if not specified. Every other column of `df` is copied into
    the relation table.

    :param df: Input :class:`pandas.DataFrame` to use.
    :param selected_columns_left: `List[str]` a list of columns selected for
        building the left part of the data pack (forwarded to `_merge`).
    :param selected_columns_right: `List[str]` a list of columns selected for
        building the right part of the data pack (forwarded to `_merge`).
    :raises ValueError: If `df` lacks `text_left` or `text_right`.

    NOTE(review): the example below is carried over from the original
    docstring and calls `mz.pack(df)` without the selected-column arguments
    this function requires; treat it as an illustration of the generated
    ids only -- TODO update it for this signature.

    Examples::
        >>> import matchzoo as mz
        >>> import pandas as pd
        >>> df = pd.DataFrame(data={'text_left': list('AABC'),
        ...                         'text_right': list('abbc'),
        ...                         'label': [0, 1, 1, 0]})
        >>> mz.pack(df).frame()
          id_left text_left id_right text_right  label
        0     L-0         A      R-0          a      0
        1     L-0         A      R-1          b      1
        2     L-1         B      R-1          b      1
        3     L-2         C      R-2          c      0

    """
    has_both_texts = 'text_left' in df and 'text_right' in df
    if not has_both_texts:
        raise ValueError(
            'Input data frame must have `text_left` and `text_right`.')

    # Use the caller's ids when present, otherwise generate them.
    id_left = (df['id_left'] if 'id_left' in df
               else _gen_ids(df, 'text_left', 'L-'))
    id_right = (df['id_right'] if 'id_right' in df
                else _gen_ids(df, 'text_right', 'R-'))

    # The relation table pairs the ids and carries every remaining column.
    relation = pd.DataFrame(data={'id_left': id_left, 'id_right': id_right})
    reserved = ('id_left', 'id_right', 'text_left', 'text_right')
    for name in df:
        if name in reserved:
            continue
        relation[name] = df[name]

    # Left and right tables are built from the caller-selected columns.
    left = _merge(df, id_left, selected_columns_left)
    right = _merge(df, id_right, selected_columns_right)
    return matchzoo.DataPack(relation, left, right)
Пример #3
0
def pack(
    df: pd.DataFrame,
    task: typing.Union[str, BaseTask] = 'ranking',
) -> 'matchzoo.DataPack':
    """
    Pack a :class:`DataPack` using `df`.

    The `df` must have `text_left` and `text_right` columns. Optionally,
    the `df` can have `id_left`, `id_right` to index `text_left` and
    `text_right` respectively. `id_left`, `id_right` will be automatically
    generated if not specified. When a `label` column is present it is cast
    to `int` for classification and to `float` for ranking.

    :param df: Input :class:`pandas.DataFrame` to use.
    :param task: Could be one of `ranking`, `classification` or a
        :class:`matchzoo.engine.BaseTask` instance.
    :raises ValueError: If `df` lacks `text_left` or `text_right`, or if a
        `label` column is present and `task` is neither a ranking nor a
        classification task.

    Examples::
        >>> import matchzoo as mz
        >>> import pandas as pd
        >>> df = pd.DataFrame(data={'text_left': list('AABC'),
        ...                         'text_right': list('abbc'),
        ...                         'label': [0, 1, 1, 0]})
        >>> mz.pack(df, task='classification').frame()
          id_left text_left id_right text_right  label
        0     L-0         A      R-0          a      0
        1     L-0         A      R-1          b      1
        2     L-1         B      R-1          b      1
        3     L-2         C      R-2          c      0
        >>> mz.pack(df, task='ranking').frame()
          id_left text_left id_right text_right  label
        0     L-0         A      R-0          a    0.0
        1     L-0         A      R-1          b    1.0
        2     L-1         B      R-1          b    1.0
        3     L-2         C      R-2          c    0.0

    """
    has_both_texts = 'text_left' in df and 'text_right' in df
    if not has_both_texts:
        raise ValueError(
            'Input data frame must have `text_left` and `text_right`.')

    # Use the caller's ids when present, otherwise generate them.
    id_left = (df['id_left'] if 'id_left' in df
               else _gen_ids(df, 'text_left', 'L-'))
    id_right = (df['id_right'] if 'id_right' in df
                else _gen_ids(df, 'text_right', 'R-'))

    # The relation table pairs the ids and carries every remaining column.
    relation = pd.DataFrame(data={'id_left': id_left, 'id_right': id_right})
    reserved = ('id_left', 'id_right', 'text_left', 'text_right')
    for name in df:
        if name in reserved:
            continue
        relation[name] = df[name]

    # The task decides the label dtype; it is only checked when a label
    # column actually exists.
    if 'label' in relation:
        if task == 'classification' or isinstance(
                task, matchzoo.tasks.Classification):
            label_type = int
        elif task == 'ranking' or isinstance(task, matchzoo.tasks.Ranking):
            label_type = float
        else:
            raise ValueError(f"{task} is not a valid task.")
        relation['label'] = relation['label'].astype(label_type)

    # Left and right tables hold the texts keyed by their ids.
    left = _merge(df, id_left, 'text_left', 'id_left')
    right = _merge(df, id_right, 'text_right', 'id_right')
    return matchzoo.DataPack(relation, left, right)
Пример #4
0
# Query-side tables: keep the queries listed for each split, reduced to the
# id/text columns and indexed by `id_left`.
left_train = pd.merge(
    train_id_left, left, how="inner", on="id_left"
)[["id_left", "text_left"]].set_index("id_left")
left_vali = pd.merge(
    vali_id_left, left, how="inner", on="id_left"
)[["id_left", "text_left"]].set_index("id_left")
left_test = pd.merge(
    test_id_left, left, how="inner", on="id_left"
)[["id_left", "text_left"]].set_index("id_left")

# Document-side tables: keep the documents referenced by each split's
# relation, de-duplicated and indexed by `id_right`.
right_train = pd.merge(
    relation_train["id_right"].drop_duplicates(), right,
    how="inner", on="id_right",
)[["id_right", "text_right"]].drop_duplicates().set_index("id_right")
right_vali = pd.merge(
    relation_vali["id_right"].drop_duplicates(), right,
    how="inner", on="id_right",
)[["id_right", "text_right"]].drop_duplicates().set_index("id_right")
right_test = pd.merge(
    relation_test["id_right"].drop_duplicates(), right,
    how="inner", on="id_right",
)[["id_right", "text_right"]].drop_duplicates().set_index("id_right")

# Assemble one raw DataPack per split from its relation, left and right.
print('data loading ...')
train_pack_raw = mz.DataPack(
    relation=relation_train, left=left_train, right=right_train)
dev_pack_raw = mz.DataPack(
    relation=relation_vali, left=left_vali, right=right_vali)
test_pack_raw = mz.DataPack(
    relation=relation_test, left=left_test, right=right_test)
print('data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`')

# Ranking task trained with the hinge loss
# (mz.losses.RankCrossEntropyLoss() is the alternative loss tried here)
# and evaluated with NDCG@3, NDCG@5 and MAP.
ranking_task = mz.tasks.Ranking(losses=mz.losses.RankHingeLoss())
ranking_task.metrics = [
    mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
    mz.metrics.MeanAveragePrecision(),
]
print("`ranking_task` initialized with metrics", ranking_task.metrics)

preprocessor = mz.preprocessors.BasicPreprocessor(
    truncated_length_left = 10,