def main():
    """Command-line entry point for evaluating a passage reranker.

    Parses the command-line options, turns the dataset folder and index
    directory into relevance examples, constructs the reranker selected by
    ``--method`` and logs one formatted line per metric produced by the
    evaluator. When the method is ``duo_t5`` the constructed object is
    indexed as a (mono, duo) pair and evaluated with ``DuoRerankerEvaluator``.
    """
    builder = ArgumentParserBuilder()
    option_specs = [
        opt('--task', type=str, default='msmarco'),
        opt('--dataset', type=Path, required=True),
        opt('--index-dir', type=Path, required=True),
        opt('--method', required=True, type=str, choices=METHOD_CHOICES),
        opt('--model',
            required=True,
            type=str,
            help='Path to pre-trained model or huggingface model name'),
        opt('--duo_model',
            type=str,
            help='Path to pre-trained model or huggingface model name'),
        opt('--mono_hits',
            type=int,
            default=50,
            help='Top k candidates from mono for duo reranking'),
        opt('--output-file', type=Path, default='.'),
        opt('--overwrite-output', action='store_true'),
        opt('--split', type=str, default='dev', choices=('dev', 'eval')),
        opt('--batch-size', '-bsz', type=int, default=96),
        opt('--device', type=str, default='cuda:0'),
        opt('--is-duo', action='store_true'),
        opt('--from-tf', action='store_true'),
        opt('--metrics',
            type=str,
            nargs='+',
            default=metric_names(),
            choices=metric_names()),
        opt('--model-type', type=str),
        opt('--tokenizer-name', type=str),
    ]
    builder.add_opts(*option_specs)
    args = builder.parser.parse_args()
    options = PassageRankingEvaluationOptions(**vars(args))

    logging.info("Preprocessing Queries & Passages:")
    dataset = MsMarcoDataset.from_folder(
        str(options.dataset),
        split=options.split,
        is_duo=options.is_duo,
    )
    examples = dataset.to_relevance_examples(
        str(options.index_dir),
        is_duo=options.is_duo,
    )

    logging.info("Loading Ranker & Tokenizer:")
    # Dispatch table: method name -> factory taking the options object.
    constructors = {
        'transformer': construct_transformer,
        'bm25': construct_bm25,
        't5': construct_t5,
        'duo_t5': construct_duo_t5,
        'seq_class_transformer': construct_seq_class_transformer,
        'random': lambda _: RandomReranker(),
    }
    build_reranker = constructors[options.method]
    reranker = build_reranker(options)
    writer = MsMarcoWriter(args.output_file, args.overwrite_output)

    if options.method != 'duo_t5':
        evaluator = RerankerEvaluator(reranker, options.metrics, writer=writer)
    else:
        # For duo_t5 the constructed object is indexed as [mono, duo].
        evaluator = DuoRerankerEvaluator(
            mono_reranker=reranker[0],
            duo_reranker=reranker[1],
            metric_names=options.metrics,
            mono_hits=options.mono_hits,
            writer=writer,
        )

    # Pad metric names to a common column width (longest name plus one).
    width = 1 + max(len(name) for name in args.metrics)
    logging.info("Reranking:")
    for metric in evaluator.evaluate(examples):
        logging.info(f'{metric.name:<{width}}{metric.value:.5}')
Exemplo n.º 2
0
def main():
    """Command-line entry point for evaluating a passage reranker.

    Parses the command-line options, builds relevance examples from the
    data directory, constructs the reranker selected by ``--method`` and
    evaluates it. Each metric is logged as an aligned line, and all metrics
    are finally printed to standard output as tab-separated ``name`` /
    ``value`` lines.

    The index used to build the relevance examples is taken from
    ``--index-dir`` when that flag is supplied; otherwise it falls back to
    ``SETTINGS.msmarco_index_path``.
    """
    apb = ArgumentParserBuilder()
    apb.add_opts(opt('--dataset',
                     type=str,
                     default='msmarco'),
                 opt('--data-dir', type=Path, default='/content/data/msmarco'),
                 opt('--method',
                     required=True,
                     type=str,
                     choices=METHOD_CHOICES),
                 opt('--model-name-or-path', type=str),
                 opt('--output-file', type=Path, default='.'),
                 opt('--overwrite-output', action='store_true'),
                 opt('--split',
                     type=str,
                     default='dev',
                     choices=('dev', 'eval')),
                 opt('--batch-size', '-bsz', type=int, default=96),
                 opt('--device', type=str, default='cuda:0'),
                 opt('--is-duo', action='store_true'),
                 opt('--metrics',
                     type=str,
                     nargs='+',
                     default=metric_names(),
                     choices=metric_names()),
                 opt('--model-type', type=str, default='bert-base'),
                 opt('--tokenizer-name', type=str),
                 opt('--index-dir', type=Path))
    args = apb.parser.parse_args()
    options = PassageRankingEvaluationOptions(**vars(args))
    ds = MsMarcoDataset.from_folder(str(options.data_dir), split=options.split,
                                    is_duo=options.is_duo)
    # Honour --index-dir when given: previously the flag was parsed but
    # silently ignored in favour of the configured default path.
    if args.index_dir is not None:
        index_path = str(args.index_dir)
    else:
        index_path = SETTINGS.msmarco_index_path
    examples = ds.to_relevance_examples(index_path,
                                        is_duo=options.is_duo)
    # Dispatch table: method name -> factory taking the options object.
    construct_map = dict(transformer=construct_transformer,
                         bm25=construct_bm25,
                         t5=construct_t5,
                         seq_class_transformer=construct_seq_class_transformer,
                         random=lambda _: RandomReranker())
    reranker = construct_map[options.method](options)
    writer = MsMarcoWriter(args.output_file, args.overwrite_output)
    evaluator = RerankerEvaluator(reranker, options.metrics, writer=writer)
    # Pad metric names to a common column width (longest name plus one).
    width = max(map(len, args.metrics)) + 1
    report_lines = []
    for metric in evaluator.evaluate(examples):
        logging.info(f'{metric.name:<{width}}{metric.value:.5}')
        report_lines.append(f'{metric.name}\t{metric.value}')
    # Emit the collected results in one write, after evaluation finishes.
    print('\n'.join(report_lines))