Example #1
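Loads binarized train/eval datasets; when use_char_source is set it also loads a character-level source dataset and records the char dictionary size for CharSourceModel construction.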
def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
    use_char_source: bool = False,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file)
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)

    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )

    for split, corpus in [(train_split, train_corpus),
                          (eval_split, eval_corpus)]:
        if not os.path.exists(corpus.source.data_file):
            raise ValueError(
                f"{corpus.source.data_file} for {split} not found!")
        if not os.path.exists(corpus.target.data_file):
            raise ValueError(
                f"{corpus.target.data_file} for {split} not found!")

        dst_dataset = InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
            dataset.splits[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=source_dict.pad(),
                eos_idx=source_dict.eos(),
            )
        else:
            src_dataset = InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file)
            dataset.splits[split] = data.LanguagePairDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=source_dict.pad(),
                eos_idx=source_dict.eos(),
            )

    return dataset
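For orientation, a minimal calling sketch (hypothetical): the vocabulary paths and split names below are placeholders, and the ParallelCorpusConfig construction is omitted because it is project-specific.

import argparse

# Placeholder args; only the attributes that load_binarized_dataset reads are set.
args = argparse.Namespace(
    source_vocab_file="vocab.src.txt",            # assumed path
    target_vocab_file="vocab.tgt.txt",            # assumed path
    char_source_vocab_file="vocab.char.src.txt",  # read only when use_char_source=True
)

# train_corpus / eval_corpus are ParallelCorpusConfig instances pointing at the
# binarized data files produced by preprocessing (construction omitted here).
datasets = load_binarized_dataset(
    train_corpus=train_corpus,
    eval_corpus=eval_corpus,
    train_split="train",
    eval_split="valid",
    args=args,
    use_char_source=False,
)
print(list(datasets.splits))  # ['train', 'valid']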
Example #2
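A generate() entry point that loads an ensemble with utils.load_ensemble_for_inference and builds the generation split from raw text with make_language_pair_dataset.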
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file,
    )
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file,
    )
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = utils.load_ensemble_for_inference(
        args.path,
        dataset.src_dict,
        dataset.dst_dict,
    )
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=model_args.append_eos_to_source,
        reverse_source=model_args.reverse_source,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f'| [{dataset.src}] dictionary: {len(dataset.src_dict)} types')
    print(f'| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types')
    print(f'| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples')
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print(f'| Translated {num_sentences} sentences ({gen_timer.n} tokens) '
          f'in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)')
    print(f'| Generate {args.gen_subset} with beam={args.beam}: '
          f'{scorer.result_string()}')
    return scorer.score()
Example #3
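A generate() variant that loads a possibly heterogeneous ensemble via load_diverse_ensemble_for_inference and asserts that every model agrees on append_eos_to_source and reverse_source before building the generation split.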
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    if args.source_lang is None:
        args.source_lang = "src"
    if args.target_lang is None:
        args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    dataset = data.LanguageDatasets(src=args.source_lang,
                                    dst=args.target_lang,
                                    src_dict=src_dict,
                                    dst_dict=dst_dict)
    models, model_args = load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset_from_text(
        source_text_file=args.source_text_file,
        target_text_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=append_eos_to_source,
        reverse_source=reverse_source,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(
        f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)
    print(f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
          f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
Example #4
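A generate() variant that builds the generation split from the text files in args and delegates model loading to generate_score().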
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        args=args,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    print('| {} {} examples'.format(args.gen_subset,
                                    len(dataset.splits[args.gen_subset])))

    scorer, num_sentences, gen_timer = generate_score(
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.
          format(num_sentences, gen_timer.n, gen_timer.sum,
                 1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                  scorer.result_string()))
    return scorer.score()
Example #5
File: data.py (project: sohuren/translate)
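Loads raw-text train/eval corpora, with a sanity check that all corpora share the same source-to-target dialect pair until multilingual corpora are supported.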
def load_raw_text_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)

    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )

    prev_source_dialect = None
    prev_target_dialect = None

    for split, corpus in [
        (train_split, train_corpus),
        (eval_split, eval_corpus),
    ]:
        # Sanity check that all language directions are consistent until this
        # has been updated to support multilingual corpora.
        if prev_source_dialect is None and prev_target_dialect is None:
            prev_source_dialect = corpus.source.dialect
            prev_target_dialect = corpus.target.dialect
        elif (prev_source_dialect != corpus.source.dialect or
                prev_target_dialect != corpus.target.dialect):
            raise ValueError(
                'We currently only support monolingual directions - expected '
                '{}->{} for all corpora, but found {}->{} for split {}'.format(
                    prev_source_dialect,
                    prev_target_dialect,
                    corpus.source.dialect,
                    corpus.target.dialect,
                    split,
                )
            )

        dataset.splits[split] = make_language_pair_dataset(
            source_file=corpus.source.data_file,
            target_file=corpus.target.data_file,
            source_dict=source_dict,
            target_dict=target_dict,
            args=args,
        )
    return dataset
Example #6
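Loads binarized train/eval datasets backed by indexed_dataset.IndexedInMemoryDataset, verifying that both sides of each corpus exist before constructing the split.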
def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )

    for split, corpus in [(train_split, train_corpus),
                          (eval_split, eval_corpus)]:
        if (not indexed_dataset.IndexedInMemoryDataset.exists(
                corpus.source.data_file)
                or not indexed_dataset.IndexedInMemoryDataset.exists(
                    corpus.target.data_file)):
            raise ValueError(
                f"One or both of source file: {corpus.source.data_file} and "
                f"target file: {corpus.target.data_file} for split {split} "
                f"was not found.")

        dataset.splits[split] = data.LanguagePairDataset(
            src=indexed_dataset.IndexedInMemoryDataset(
                corpus.source.data_file),
            dst=indexed_dataset.IndexedInMemoryDataset(
                corpus.target.data_file),
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )

    return dataset
Example #7
File: data.py (project: kc17/translate)
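The same loader shape as the previous example, but backed by InMemoryNumpyDataset and plain os.path.exists checks.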
def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
) -> data.LanguageDatasets:
    source_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    target_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )

    for split, corpus in [(train_split, train_corpus),
                          (eval_split, eval_corpus)]:
        if not os.path.exists(corpus.source.data_file):
            raise ValueError(
                f"{corpus.source.data_file} for {split} not found!")
        if not os.path.exists(corpus.target.data_file):
            raise ValueError(
                f"{corpus.target.data_file} for {split} not found!")

        dataset.splits[split] = data.LanguagePairDataset(
            src=InMemoryNumpyDataset.create_from_file(corpus.source.data_file),
            dst=InMemoryNumpyDataset.create_from_file(corpus.target.data_file),
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )

    return dataset
Example #8
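A benchmarking generate() that synthesizes temporary source/target text files from the vocabulary, runs generation on them, and reports per-length-bucket token and latency statistics.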
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Benchmarking should be language-agnostic
    args.source_lang = "src"
    args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    # Generate synthetic raw text files
    source_text_file = generate_synthetic_text(args.source_lang,
                                               src_dict.symbols, args)
    target_text_file = generate_synthetic_text(args.target_lang,
                                               src_dict.symbols, args)

    dataset = data.LanguageDatasets(src=args.source_lang,
                                    dst=args.target_lang,
                                    src_dict=src_dict,
                                    dst_dict=dst_dict)
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset_from_text(
        source_text_file=source_text_file,
        target_text_file=target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=append_eos_to_source,
        reverse_source=reverse_source,
    )

    # Remove temporary text files
    os.remove(source_text_file)
    os.remove(target_text_file)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(
        f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    args.keep_detailed_timing = True
    scorer, num_sentences, gen_timer = pytorch_translate_generate._generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)

    # Remove contribution of primer sentence
    gen_timer.reset_bucket(0)

    print(
        f"| Translated {num_sentences} sentences ({sum(gen_timer.n)} tokens) "
        f"in {sum(gen_timer.sum):.3f}s ({1. / gen_timer.avg:.2f} tokens/s)")

    for bucket_id in range(gen_timer.n_buckets):
        if gen_timer.n[bucket_id] != 0:
            print(
                "  | Length {}: {} sentences ({} tok) in {:.3f}s ({:.3f} tok/s, avg. latency {:4f}s)"
                .format(
                    bucket_id * args.increment,
                    gen_timer.count[bucket_id],
                    gen_timer.n[bucket_id],
                    gen_timer.sum[bucket_id],
                    1. / gen_timer.avgs[bucket_id],
                    gen_timer.sum[bucket_id] / gen_timer.count[bucket_id],
                ))

    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
Example #9
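The most general generate(): it selects between binarized, multilingual, multi-source, and plain-text inputs, optionally builds a character-level source dataset, and then scores the chosen generation split.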
def generate(args):
    pytorch_translate_options.print_args(args)

    src_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)
    use_char_source = args.char_source_vocab_file != ""
    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file
        )
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)
    else:
        char_source_dict = None

    dataset = data.LanguageDatasets(
        src=args.source_lang, dst=args.target_lang, src_dict=src_dict, dst_dict=dst_dict
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            args.target_binary_file
        )
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = data.LanguagePairDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
    elif pytorch_translate_data.is_multilingual(args):
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text_multilingual(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=args.multiling_source_lang_id,
            target_lang_id=args.multiling_target_lang_id,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        gen_split = multisource_data.make_multisource_language_pair_dataset_from_text(
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
            char_source_dict=char_source_dict,
        )
    dataset.splits[args.gen_subset] = gen_split

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split=args.gen_subset
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
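For reference, a hypothetical args namespace listing the attributes this snippet reads directly; the values are illustrative, and the helpers it calls (print_args, is_multilingual, _generate_score) consume additional flags not shown here.

import argparse

# Illustrative values only; the real flags are defined by pytorch_translate's
# option parser, and downstream helpers read more attributes than listed.
args = argparse.Namespace(
    path="checkpoint.pt",               # ensemble checkpoint path(s)
    source_vocab_file="vocab.src.txt",
    target_vocab_file="vocab.tgt.txt",
    char_source_vocab_file="",          # empty string: no char-level source model
    source_lang="src",
    target_lang="tgt",
    source_binary_file="",              # empty: fall through to the text-file branches
    target_binary_file="",
    source_text_file=["test.src.txt"],  # note: a list; element [0] is used
    target_text_file="test.tgt.txt",
    gen_subset="test",
    beam=5,
    source_ensembling=False,
)
score = generate(args)  # returns scorer.score()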