Example #1
def _init_task(self):
    # Build a toy source dictionary from the first n_symbols lowercase
    # letters; the target dictionary is left empty.
    src_dict = Dictionary()
    self.symbols = "abcdefghijklmnopqrstuvwxyz"
    for symbol in self.symbols[: self.n_symbols]:
        src_dict.add_symbol(symbol)
    dst_dict = Dictionary()
    self.task = PytorchTranslateTask(None, src_dict, dst_dict)
Example #2
def make_language_pair_dataset_from_text(
    source_text_file: str,
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> data.LanguagePairDataset:
    return data.LanguagePairDataset(
        src=indexed_dataset.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=source_dict,
            append_eos=append_eos,
            reverse_order=reverse_source,
        ),
        dst=indexed_dataset.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=target_dict,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication the sentence has finished, even
            # if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since
            # even if the source sentence is fed to the model backwards,
            # we still want the model to start outputting from the first word.
            reverse_order=False,
        ),
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
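A minimal usage sketch for the function above; the file paths and both dictionaries are hypothetical, and the keyword values simply restate the defaults:

dataset = make_language_pair_dataset_from_text(
    source_text_file="train.src.txt",  # hypothetical path
    target_text_file="train.tgt.txt",  # hypothetical path
    source_dict=source_dict,  # a pytorch_translate Dictionary built beforehand
    target_dict=target_dict,
    append_eos=False,     # default: no EOS appended on the source side
    reverse_source=True,  # default: source tokens are fed in reverse
)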
Example #3
def make_language_pair_dataset_from_text_multilingual(
    source_text_file: str,
    target_text_file: str,
    source_lang_id: int,
    target_lang_id: int,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> data.LanguagePairDataset:
    return data.LanguagePairDataset(
        src=IndexedRawTextDatasetWithLangId(
            path=source_text_file,
            dictionary=source_dict,
            lang_id=source_lang_id,
            append_eos=append_eos,
            reverse_order=reverse_source,
            prepend_language_id=False,
        ),
        dst=IndexedRawTextDatasetWithLangId(
            path=target_text_file,
            dictionary=target_dict,
            lang_id=target_lang_id,
            append_eos=True,
            reverse_order=False,
            prepend_language_id=True,
        ),
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
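The multilingual variant differs mainly in the language-ID handling visible above: the target dataset prepends its language ID (prepend_language_id=True) while the source dataset does not. A sketch with hypothetical paths and IDs:

dataset = make_language_pair_dataset_from_text_multilingual(
    source_text_file="train.de.txt",  # hypothetical path
    target_text_file="train.en.txt",  # hypothetical path
    source_lang_id=0,  # hypothetical language IDs
    target_lang_id=1,
    source_dict=source_dict,
    target_dict=target_dict,
)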
Example #4
def build_vocabs(args: argparse.Namespace):
    """
    Builds vocabs or loads them from existing vocab files. If args.task
    is pytorch_translate_semi_supervised, we use the monolingual corpora in
    addition to the parallel corpora for building source and target vocabs.
    """
    source_files = [args.train_source_text_file]
    target_files = [args.train_target_text_file]

    if args.task == constants.SEMI_SUPERVISED_TASK and getattr(
        args, "add_monolingual_data_for_vocab_building", None
    ):
        if getattr(args, "train_mono_source_text_file", None):
            source_files.append(args.train_mono_source_text_file)
        if getattr(args, "train_mono_target_text_file", None):
            target_files.append(args.train_mono_target_text_file)

    source_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=source_files,
        vocab_file=args.source_vocab_file,
        max_vocab_size=args.source_max_vocab_size,
        tokens_with_penalty=None,
    )
    use_char_source = (args.char_source_vocab_file != "") or (
        getattr(args, "arch", "") == "char_source"
    )
    char_source_dict = None
    if use_char_source:
        embed_bytes = getattr(args, "embed_bytes", False)
        char_source_dict = Dictionary.build_vocab_file_if_nonexistent(
            corpus_files=source_files,
            vocab_file=args.char_source_vocab_file,
            max_vocab_size=args.char_source_max_vocab_size,
            tokens_with_penalty=None,
            is_char_vocab=True,
            embed_bytes=embed_bytes,
        )

    target_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=target_files,
        vocab_file=args.target_vocab_file,
        max_vocab_size=args.target_max_vocab_size,
        tokens_with_penalty=args.penalized_target_tokens_file,
    )
    return source_dict, char_source_dict, target_dict
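A hedged sketch of calling build_vocabs with a hand-built Namespace; the attribute names mirror exactly what the function reads, while every path and size is a placeholder:

import argparse

args = argparse.Namespace(
    task="pytorch_translate",  # not the semi-supervised task
    train_source_text_file="train.src.txt",
    train_target_text_file="train.tgt.txt",
    source_vocab_file="vocab.src.txt",
    target_vocab_file="vocab.tgt.txt",
    source_max_vocab_size=30000,
    target_max_vocab_size=30000,
    char_source_vocab_file="",  # empty string: use_char_source stays False
    penalized_target_tokens_file=None,
)
source_dict, char_source_dict, target_dict = build_vocabs(args)
# char_source_dict is None here, since neither a char vocab file nor a
# "char_source" arch was supplied.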
Example #5
def build_vocab_multicorpus(
    corpus_langs,
    corpus_files,
    vocab_langs,
    vocab_files,
    max_vocab_size,
    tokens_with_penalty=None,
):
    lang2corpus = {lang: [] for lang in vocab_langs}
    for lang, corpus_file in zip(corpus_langs, corpus_files):
        lang2corpus[lang].append(corpus_file)
    return {
        lang: Dictionary.build_vocab_file_if_nonexistent(
            corpus_files=lang2corpus[lang],
            vocab_file=vocab_file,
            max_vocab_size=max_vocab_size,
            tokens_with_penalty=tokens_with_penalty,
        )
        for lang, vocab_file in zip(vocab_langs, vocab_files)
    }
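A usage sketch with hypothetical file names, showing how the zipped corpus lists map onto the per-language vocabs: two corpora feed the German dictionary, one feeds the English one:

vocabs = build_vocab_multicorpus(
    corpus_langs=["de", "de", "en"],
    corpus_files=["train.de-en.de", "mono.de", "train.de-en.en"],
    vocab_langs=["de", "en"],
    vocab_files=["vocab.de.txt", "vocab.en.txt"],
    max_vocab_size=50000,
)
de_dict = vocabs["de"]  # Dictionary built from both German corpora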
Example #6
def preprocess_corpora_bilingual(args):
    source_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=[args.train_source_text_file],
        vocab_file=args.source_vocab_file,
        max_vocab_size=args.source_max_vocab_size,
        tokens_with_penalty=None,
    )
    use_char_source = (args.char_source_vocab_file != "") or (
        getattr(args, "arch", "") == "char_source"
    )
    char_source_dict = None
    if use_char_source:
        char_source_dict = Dictionary.build_vocab_file_if_nonexistent(
            corpus_files=[args.train_source_text_file],
            vocab_file=args.char_source_vocab_file,
            max_vocab_size=args.char_source_max_vocab_size,
            tokens_with_penalty=None,
            is_char_vocab=True,
        )
    if args.train_source_text_file:
        args.train_source_binary_path = binarize_text_file(
            text_file=args.train_source_text_file,
            dictionary=source_dict,
            output_path=args.train_source_binary_path,
            append_eos=args.append_eos_to_source,
            reverse_order=args.reverse_source,
            use_char_data=use_char_source,
            char_dictionary=char_source_dict,
        )
    if args.eval_source_text_file:
        args.eval_source_binary_path = binarize_text_file(
            text_file=args.eval_source_text_file,
            dictionary=source_dict,
            output_path=args.eval_source_binary_path,
            append_eos=args.append_eos_to_source,
            reverse_order=args.reverse_source,
            use_char_data=use_char_source,
            char_dictionary=char_source_dict,
        )

    target_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=[args.train_target_text_file],
        vocab_file=args.target_vocab_file,
        max_vocab_size=args.target_max_vocab_size,
        tokens_with_penalty=args.penalized_target_tokens_file,
    )
    # For target sentences, we always append EOS tokens, and never reverse
    # their order.
    if args.train_target_text_file:
        args.train_target_binary_path = binarize_text_file(
            text_file=args.train_target_text_file,
            dictionary=target_dict,
            output_path=args.train_target_binary_path,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication the sentence has finished, even
            # if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since
            # even if the source sentence is fed to the model backwards,
            # we still want the model to start outputting from the first word.
            reverse_order=False,
        )
    if args.eval_target_text_file:
        args.eval_target_binary_path = binarize_text_file(
            text_file=args.eval_target_text_file,
            dictionary=target_dict,
            output_path=args.eval_target_binary_path,
            append_eos=True,
            reverse_order=False,
        )
Example #7
def __init__(self):
    # Minimal setup: only a target-side Dictionary is needed here.
    self.target_dictionary = Dictionary()