def load_wikIR(collection_path):
    """
    Load the training, validation and test splits of a wikIR collection.

    The following files are read from `collection_path`::

        documents.csv
        training/queries.csv      training/BM25.qrels.csv
        validation/queries.csv    validation/BM25.qrels.csv
        test/queries.csv          test/BM25.qrels.csv

    `queries.csv` is indexed by its `id_left` column, `documents.csv` by its
    `id_right` column, and `BM25.qrels.csv` by its first column.

    `documents.csv` is shared by every split, so it is parsed only once; each
    split receives its own copy so that the three returned packs do not share
    a single underlying data frame.

    :param collection_path: Directory holding the collection. A `str`, or any
        object whose string form is that directory (e.g. `pathlib.Path`).
    :return: A tuple `(train_raw, validation_raw, test_raw)` of
        `mz.DataPack` objects.
    """
    # Parse the (shared) document table a single time instead of once per
    # split.
    documents = pd.read_csv(f'{collection_path}/documents.csv',
                            index_col='id_right')

    def _load_split(split):
        # Build the pack for one split sub-directory.
        queries = pd.read_csv(f'{collection_path}/{split}/queries.csv',
                              index_col='id_left')
        qrels = pd.read_csv(f'{collection_path}/{split}/BM25.qrels.csv',
                            index_col=0)
        # Independent copy: in-place edits made through one pack must not
        # leak into the other two.
        return mz.DataPack(left=queries, right=documents.copy(),
                           relation=qrels)

    train_raw = _load_split('training')
    validation_raw = _load_split('validation')
    test_raw = _load_split('test')
    return train_raw, validation_raw, test_raw
def pack(df: pd.DataFrame, selected_columns_left: List[str],
         selected_columns_right: List[str]) -> 'matchzoo.DataPack':
    """
    Pack a :class:`DataPack` using `df` and caller-selected columns.

    This is a customized variant of the packing routine, kept separate to
    avoid overriding code of the Learning to Rank project.

    The `df` must have `text_left` and `text_right` columns. Optionally, the
    `df` can have `id_left`, `id_right` columns; when one is absent, the
    corresponding IDs are produced by `_gen_ids` from the text column, with
    the prefix `L-` or `R-`.

    Every column of `df` other than `id_left`, `id_right`, `text_left` and
    `text_right` is copied into the relation table.

    :param df: Input :class:`pandas.DataFrame` to use.
    :param selected_columns_left: `List[str]` of columns handed to `_merge`
        together with the left IDs to build the left table.
    :param selected_columns_right: `List[str]` of columns handed to `_merge`
        together with the right IDs to build the right table.
    :return: `matchzoo.DataPack` built from the relation, left and right
        tables.
    :raises ValueError: If `df` lacks `text_left` or `text_right`.
    """
    if not ('text_left' in df and 'text_right' in df):
        raise ValueError(
            'Input data frame must have `text_left` and `text_right`.')

    # Gather IDs: prefer the ones supplied with the frame, otherwise
    # generate them.
    id_left = (df['id_left'] if 'id_left' in df
               else _gen_ids(df, 'text_left', 'L-'))
    id_right = (df['id_right'] if 'id_right' in df
                else _gen_ids(df, 'text_right', 'R-'))

    # Build Relation: the ID pair plus every non-ID, non-text column.
    relation = pd.DataFrame(data={'id_left': id_left, 'id_right': id_right})
    reserved = ('id_left', 'id_right', 'text_left', 'text_right')
    for column in df:
        if column in reserved:
            continue
        relation[column] = df[column]

    # Build Left and Right
    left = _merge(df, id_left, selected_columns_left)
    right = _merge(df, id_right, selected_columns_right)
    return matchzoo.DataPack(relation, left, right)
def pack(
    df: pd.DataFrame,
    task: typing.Union[str, BaseTask] = 'ranking',
) -> 'matchzoo.DataPack':
    """
    Pack a :class:`DataPack` using `df`.

    The `df` must have `text_left` and `text_right` columns. Optionally, the
    `df` can have `id_left`, `id_right` to index `text_left` and `text_right`
    respectively. `id_left`, `id_right` will be automatically generated if not
    specified.

    When a `label` column is present it is cast to `int` for a classification
    task and to `float` for a ranking task.

    :param df: Input :class:`pandas.DataFrame` to use.
    :param task: Could be one of `ranking`, `classification` or a
        :class:`matchzoo.engine.BaseTask` instance.
    :return: `matchzoo.DataPack` built from the relation, left and right
        tables.
    :raises ValueError: If `df` lacks `text_left` or `text_right`, or if a
        `label` column is present and `task` is neither a ranking nor a
        classification task.

    Examples::
        >>> import matchzoo as mz
        >>> import pandas as pd
        >>> df = pd.DataFrame(data={'text_left': list('AABC'),
        ...                         'text_right': list('abbc'),
        ...                         'label': [0, 1, 1, 0]})
        >>> mz.pack(df, task='classification').frame()
          id_left text_left id_right text_right  label
        0     L-0         A      R-0          a      0
        1     L-0         A      R-1          b      1
        2     L-1         B      R-1          b      1
        3     L-2         C      R-2          c      0
        >>> mz.pack(df, task='ranking').frame()
          id_left text_left id_right text_right  label
        0     L-0         A      R-0          a    0.0
        1     L-0         A      R-1          b    1.0
        2     L-1         B      R-1          b    1.0
        3     L-2         C      R-2          c    0.0

    """
    if not ('text_left' in df and 'text_right' in df):
        raise ValueError(
            'Input data frame must have `text_left` and `text_right`.')

    # Gather IDs: prefer the ones supplied with the frame, otherwise
    # generate them.
    id_left = (df['id_left'] if 'id_left' in df
               else _gen_ids(df, 'text_left', 'L-'))
    id_right = (df['id_right'] if 'id_right' in df
                else _gen_ids(df, 'text_right', 'R-'))

    # Build Relation: the ID pair plus every non-ID, non-text column.
    relation = pd.DataFrame(data={'id_left': id_left, 'id_right': id_right})
    reserved = ('id_left', 'id_right', 'text_left', 'text_right')
    for column in df:
        if column in reserved:
            continue
        relation[column] = df[column]

    # The task is only validated when there is a label column to cast.
    if 'label' in relation:
        if task == 'classification' or isinstance(
                task, matchzoo.tasks.Classification):
            label_dtype = int
        elif task == 'ranking' or isinstance(task, matchzoo.tasks.Ranking):
            label_dtype = float
        else:
            raise ValueError(f"{task} is not a valid task.")
        relation['label'] = relation['label'].astype(label_dtype)

    # Build Left and Right
    left = _merge(df, id_left, 'text_left', 'id_left')
    right = _merge(df, id_right, 'text_right', 'id_right')
    return matchzoo.DataPack(relation, left, right)
left_train = pd.merge(train_id_left, left, how="inner", on="id_left")[["id_left", "text_left"]] left_train.set_index("id_left", inplace=True) left_vali = pd.merge(vali_id_left, left, how="inner", on="id_left")[["id_left", "text_left"]] left_vali.set_index("id_left", inplace=True) left_test = pd.merge(test_id_left, left, how="inner", on="id_left")[["id_left", "text_left"]] left_test.set_index("id_left", inplace=True) right_train = pd.merge(relation_train.id_right.drop_duplicates(), right, how="inner", on="id_right")[["id_right", "text_right"]].drop_duplicates() right_train.set_index("id_right", inplace=True) right_vali = pd.merge(relation_vali.id_right.drop_duplicates(), right, how="inner", on="id_right")[["id_right", "text_right"]].drop_duplicates() right_vali.set_index("id_right", inplace=True) right_test = pd.merge(relation_test.id_right.drop_duplicates(), right, how="inner", on="id_right")[["id_right", "text_right"]].drop_duplicates() right_test.set_index("id_right", inplace=True) print('data loading ...') train_pack_raw = mz.DataPack(relation=relation_train,left=left_train,right=right_train) dev_pack_raw = mz.DataPack(relation=relation_vali,left=left_vali,right=right_vali) test_pack_raw = mz.DataPack(relation=relation_test,left=left_test,right=right_test) print('data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`') ranking_task = mz.tasks.Ranking(losses=mz.losses.RankHingeLoss()) #ranking_task = mz.tasks.Ranking(losses=mz.losses.RankCrossEntropyLoss()) ranking_task.metrics = [ mz.metrics.NormalizedDiscountedCumulativeGain(k=3), mz.metrics.NormalizedDiscountedCumulativeGain(k=5), mz.metrics.MeanAveragePrecision() ] print("`ranking_task` initialized with metrics", ranking_task.metrics) preprocessor = mz.preprocessors.BasicPreprocessor( truncated_length_left = 10,