示例#1
0
def generate(args):
    """Translate the configured test corpus and return the scorer's result.

    Loads the source and target vocabularies, loads the model ensemble from
    ``args.path``, builds the ``args.gen_subset`` split from the raw text
    files, runs ``_generate_score`` on it and prints a short summary.

    Args:
        args: Parsed options object. Attributes read here: ``path``,
            ``source_lang``, ``target_lang``, ``source_vocab_file``,
            ``target_vocab_file``, ``source_text_file``,
            ``target_text_file``, ``gen_subset`` and ``beam``.
            ``source_lang`` / ``target_lang`` are overwritten in place with
            ``"src"`` / ``"tgt"`` when they are ``None``.

    Returns:
        The value of ``scorer.score()`` for the generated translations.

    Raises:
        AssertionError: If ``args.path`` is ``None``, or if the loaded models
            disagree on ``append_eos_to_source`` / ``reverse_source``.
            ``assert_test_corpus_and_vocab_files_specified`` presumably
            raises as well when its checks fail (not visible here).
    """
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Fall back to generic language tags when none were given.
    if args.source_lang is None:
        args.source_lang = "src"
    if args.target_lang is None:
        args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)

    # All ensemble members must agree on the source preprocessing flags,
    # since a single dataset is built and shared by every model.
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)

    dataset.splits[args.gen_subset] = (
        pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=args.source_text_file,
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    )

    # Both language tags are guaranteed to be non-None at this point (see the
    # defaults above), so there is nothing to infer back from the dataset.

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(
        f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)
    print(f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
          f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
示例#2
0
def generate(args):
    """Benchmark generation on synthetic text and return the scorer's result.

    Writes synthetic source and target text files, builds the
    ``args.gen_subset`` split from them, runs ``_generate_score`` with
    detailed timing enabled, and prints overall and per-length-bucket
    throughput figures.

    Args:
        args: Parsed options object. Attributes read here: ``path``,
            ``source_vocab_file``, ``target_vocab_file``, ``gen_subset``,
            ``increment`` and ``beam``. ``source_lang`` / ``target_lang``
            are overwritten with ``"src"`` / ``"tgt"`` and
            ``keep_detailed_timing`` is set to ``True``.

    Returns:
        The value of ``scorer.score()`` for the generated translations.

    Raises:
        AssertionError: If ``args.path`` is ``None``, or if the loaded models
            disagree on ``append_eos_to_source`` / ``reverse_source``.
            ``assert_test_corpus_and_vocab_files_specified`` presumably
            raises as well when its checks fail (not visible here).
    """
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Benchmarking should be language-agnostic
    args.source_lang = "src"
    args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    # Track every synthetic file as soon as it exists so that the cleanup in
    # ``finally`` runs even when model loading or dataset creation fails.
    temp_text_files = []
    try:
        # Generate synthetic raw text files
        source_text_file = generate_synthetic_text(args.source_lang,
                                                   src_dict.symbols, args)
        temp_text_files.append(source_text_file)
        # Target-side text must be sampled from the target vocabulary;
        # otherwise it is binarized against a dictionary it was not drawn from.
        target_text_file = generate_synthetic_text(args.target_lang,
                                                   dst_dict.symbols, args)
        temp_text_files.append(target_text_file)

        dataset = data.LanguageDatasets(src=args.source_lang,
                                        dst=args.target_lang,
                                        src_dict=src_dict,
                                        dst_dict=dst_dict)
        models, model_args = (
            pytorch_translate_utils.load_diverse_ensemble_for_inference(
                args.path, dataset.src_dict, dataset.dst_dict))

        # All ensemble members must agree on the source preprocessing flags,
        # since a single dataset is built and shared by every model.
        append_eos_to_source = model_args[0].append_eos_to_source
        reverse_source = model_args[0].reverse_source
        assert all(a.append_eos_to_source == append_eos_to_source
                   and a.reverse_source == reverse_source for a in model_args)

        dataset.splits[args.gen_subset] = (
            pytorch_translate_data.make_language_pair_dataset_from_text(
                source_text_file=source_text_file,
                target_text_file=target_text_file,
                source_dict=src_dict,
                target_dict=dst_dict,
                append_eos=append_eos_to_source,
                reverse_source=reverse_source,
            ))
    finally:
        # Remove temporary text files
        for temp_path in temp_text_files:
            os.remove(temp_path)

    # The language tags were set unconditionally above, so there is nothing
    # to infer back from the dataset here.

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(
        f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    # Request per-bucket timing so the per-length report below has data.
    args.keep_detailed_timing = True
    scorer, num_sentences, gen_timer = pytorch_translate_generate._generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)

    # Remove contribution of primer sentence
    gen_timer.reset_bucket(0)

    print(
        f"| Translated {num_sentences} sentences ({sum(gen_timer.n)} tokens) "
        f"in {sum(gen_timer.sum):.3f}s ({1. / gen_timer.avg:.2f} tokens/s)")

    # One report line per non-empty bucket; the bucket index times
    # ``args.increment`` is printed as the sentence length.
    # NOTE(review): "{:4f}" below looks like a typo for "{:.4f}" (width 4 has
    # no visible effect on a 6-decimal float) -- left unchanged so the emitted
    # text stays the same; confirm before changing.
    for bucket_id in range(gen_timer.n_buckets):
        if gen_timer.n[bucket_id] != 0:
            print(
                "  | Length {}: {} sentences ({} tok) in {:.3f}s ({:.3f} tok/s, avg. latency {:4f}s)"
                .format(
                    bucket_id * args.increment,
                    gen_timer.count[bucket_id],
                    gen_timer.n[bucket_id],
                    gen_timer.sum[bucket_id],
                    1. / gen_timer.avgs[bucket_id],
                    gen_timer.sum[bucket_id] / gen_timer.count[bucket_id],
                ))

    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
示例#3
0
def generate(args):
    """Build the evaluation split, run generation and return the score.

    The split stored under ``args.gen_subset`` comes from one of four
    sources, checked in this order: pre-binarized files (when
    ``args.source_binary_file`` is non-empty), multilingual text, multi-source
    text (``args.source_ensembling``), or plain parallel text.

    Args:
        args: Parsed options object. ``args.char_source_dict_size`` is set
            when a character-level source vocabulary is configured, and
            ``args.source_lang`` / ``args.target_lang`` are filled in from
            the dataset when either is ``None``.

    Returns:
        The value of ``scorer.score()`` for the generated translations.

    Raises:
        AssertionError: If the loaded models disagree on
            ``append_eos_to_source`` / ``reverse_source``, or if a source
            binary file is given without a target binary file.
    """
    pytorch_translate_options.print_args(args)

    load_vocab = pytorch_translate_dictionary.Dictionary.load
    src_dict = load_vocab(args.source_vocab_file)
    dst_dict = load_vocab(args.target_vocab_file)

    char_source_dict = None
    use_char_source = args.char_source_vocab_file != ""
    if use_char_source:
        char_source_dict = load_vocab(args.char_source_vocab_file)
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)

    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )

    # Every model in the ensemble has to share the first model's source
    # preprocessing flags, because only one split is built for all of them.
    reference_args = model_args[0]
    append_eos_to_source = reference_args.append_eos_to_source
    reverse_source = reference_args.reverse_source
    assert not any(
        member.append_eos_to_source != append_eos_to_source
        or member.reverse_source != reverse_source
        for member in model_args
    )

    if args.source_binary_file != "":
        # Pre-binarized data: both sides must be supplied together.
        assert args.target_binary_file != ""
        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            args.target_binary_file
        )
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                args.source_binary_file
            )
            pair_dataset_cls = char_data.LanguagePairSourceCharDataset
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                args.source_binary_file
            )
            pair_dataset_cls = data.LanguagePairDataset
        eval_split = pair_dataset_cls(
            src=src_dataset,
            dst=dst_dataset,
            pad_idx=src_dict.pad(),
            eos_idx=dst_dict.eos(),
        )
    else:
        # Keyword arguments common to all three text-based builders.
        text_kwargs = {
            "target_text_file": args.target_text_file,
            "source_dict": src_dict,
            "target_dict": dst_dict,
            "append_eos": append_eos_to_source,
            "reverse_source": reverse_source,
        }
        if pytorch_translate_data.is_multilingual(args):
            eval_split = pytorch_translate_data.make_language_pair_dataset_from_text_multilingual(
                source_text_file=args.source_text_file[0],
                source_lang_id=args.multiling_source_lang_id,
                target_lang_id=args.multiling_target_lang_id,
                **text_kwargs,
            )
        elif args.source_ensembling:
            # The multi-source builder receives the full list of source files.
            eval_split = multisource_data.make_multisource_language_pair_dataset_from_text(
                source_text_files=args.source_text_file,
                **text_kwargs,
            )
        else:
            eval_split = pytorch_translate_data.make_language_pair_dataset_from_text(
                source_text_file=args.source_text_file[0],
                char_source_dict=char_source_dict,
                **text_kwargs,
            )
    dataset.splits[args.gen_subset] = eval_split

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")

    # The fourth element of the result is not used here.
    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()