def _init_task(self):
    # Build a toy source vocabulary from the first n_symbols lowercase
    # letters; the target dictionary is left empty.
    src_dict = Dictionary()
    self.symbols = "abcdefghijklmnopqrstuvwxyz"
    for symbol in self.symbols[: self.n_symbols]:
        src_dict.add_symbol(symbol)
    dst_dict = Dictionary()
    self.task = PytorchTranslateTask(None, src_dict, dst_dict)
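# Usage sketch (illustrative, not from the source): _init_task is a test
# helper that expects self.n_symbols to be set before it runs. The test-case
# class name and the symbol count below are assumptions.
import unittest


class ToyTaskTest(unittest.TestCase):
    def setUp(self):
        self.n_symbols = 5  # toy source vocab: "a" through "e"
        _init_task(self)  # builds self.task with an empty target dictionary

    def test_task_is_built(self):
        self.assertIsNotNone(self.task)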
def make_language_pair_dataset_from_text(
    source_text_file: str,
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> data.LanguagePairDataset:
    return data.LanguagePairDataset(
        src=indexed_dataset.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=source_dict,
            append_eos=append_eos,
            reverse_order=reverse_source,
        ),
        dst=indexed_dataset.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=target_dict,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication the sentence has finished, even
            # if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since
            # even if the source sentence is fed to the model backwards,
            # we still want the model to start outputting from the first word.
            reverse_order=False,
        ),
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
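# Usage sketch (illustrative; the file paths are placeholders, and loading
# vocabs via the fairseq-style Dictionary.load classmethod is an assumption):
src_dict = pytorch_translate_dictionary.Dictionary.load("/tmp/src_vocab.txt")
tgt_dict = pytorch_translate_dictionary.Dictionary.load("/tmp/tgt_vocab.txt")
eval_dataset = make_language_pair_dataset_from_text(
    source_text_file="/tmp/eval.src",
    target_text_file="/tmp/eval.tgt",
    source_dict=src_dict,
    target_dict=tgt_dict,
    append_eos=False,  # default: no EOS appended to the source side
    reverse_source=True,  # default: feed the source in reverse order
)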
def make_language_pair_dataset_from_text_multilingual(
    source_text_file: str,
    target_text_file: str,
    source_lang_id: int,
    target_lang_id: int,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> data.LanguagePairDataset:
    return data.LanguagePairDataset(
        src=IndexedRawTextDatasetWithLangId(
            path=source_text_file,
            dictionary=source_dict,
            lang_id=source_lang_id,
            append_eos=append_eos,
            reverse_order=reverse_source,
            prepend_language_id=False,
        ),
        dst=IndexedRawTextDatasetWithLangId(
            path=target_text_file,
            dictionary=target_dict,
            lang_id=target_lang_id,
            # As in the bilingual variant: always append EOS to the target
            # and preserve its word order. The language id token is
            # prepended on the target side only.
            append_eos=True,
            reverse_order=False,
            prepend_language_id=True,
        ),
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
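# Usage sketch (illustrative; the language ids and paths are made up, and
# src_dict/tgt_dict are reused from the sketch above): the multilingual
# variant additionally tags each side with its language id, prepending the
# id token on the target side only.
multilingual_dataset = make_language_pair_dataset_from_text_multilingual(
    source_text_file="/tmp/train.de",
    target_text_file="/tmp/train.en",
    source_lang_id=0,
    target_lang_id=1,
    source_dict=src_dict,
    target_dict=tgt_dict,
)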
def build_vocabs(args: argparse.Namespace):
    """
    Builds vocabs or loads them from existing vocab files. If args.task is
    pytorch_translate_semi_supervised, we use the monolingual corpora in
    addition to the parallel corpora for building source and target vocabs.
    """
    source_files = [args.train_source_text_file]
    target_files = [args.train_target_text_file]
    if args.task == constants.SEMI_SUPERVISED_TASK and getattr(
        args, "add_monolingual_data_for_vocab_building", None
    ):
        if getattr(args, "train_mono_source_text_file", None):
            source_files.append(args.train_mono_source_text_file)
        if getattr(args, "train_mono_target_text_file", None):
            target_files.append(args.train_mono_target_text_file)

    source_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=source_files,
        vocab_file=args.source_vocab_file,
        max_vocab_size=args.source_max_vocab_size,
        tokens_with_penalty=None,
    )
    # A character-level source vocab is needed when a char vocab file is
    # explicitly given or when the architecture is char-based.
    use_char_source = (args.char_source_vocab_file != "") or (
        getattr(args, "arch", "") == "char_source"
    )
    char_source_dict = None
    if use_char_source:
        embed_bytes = getattr(args, "embed_bytes", False)
        char_source_dict = Dictionary.build_vocab_file_if_nonexistent(
            corpus_files=source_files,
            vocab_file=args.char_source_vocab_file,
            max_vocab_size=args.char_source_max_vocab_size,
            tokens_with_penalty=None,
            is_char_vocab=True,
            embed_bytes=embed_bytes,
        )
    target_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=target_files,
        vocab_file=args.target_vocab_file,
        max_vocab_size=args.target_max_vocab_size,
        tokens_with_penalty=args.penalized_target_tokens_file,
    )
    return source_dict, char_source_dict, target_dict
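# Usage sketch (illustrative; the namespace below sets only the attributes
# build_vocabs reads, with made-up paths and sizes):
vocab_args = argparse.Namespace(
    task="pytorch_translate",  # not the semi-supervised task
    train_source_text_file="/tmp/train.src",
    train_target_text_file="/tmp/train.tgt",
    source_vocab_file="/tmp/src_vocab.txt",
    target_vocab_file="/tmp/tgt_vocab.txt",
    source_max_vocab_size=30000,
    target_max_vocab_size=30000,
    char_source_vocab_file="",  # empty: no char-level source vocab
    char_source_max_vocab_size=0,
    penalized_target_tokens_file=None,
)
source_dict, char_source_dict, target_dict = build_vocabs(vocab_args)
assert char_source_dict is None  # char vocab was not requested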
def build_vocab_multicorpus(
    corpus_langs,
    corpus_files,
    vocab_langs,
    vocab_files,
    max_vocab_size,
    tokens_with_penalty=None,
):
    # Group corpus files by language, then build (or load) one vocab per
    # language over all of that language's corpora.
    lang2corpus = {lang: [] for lang in vocab_langs}
    for lang, corpus_file in zip(corpus_langs, corpus_files):
        lang2corpus[lang].append(corpus_file)
    return {
        lang: Dictionary.build_vocab_file_if_nonexistent(
            corpus_files=lang2corpus[lang],
            vocab_file=vocab_file,
            max_vocab_size=max_vocab_size,
            tokens_with_penalty=tokens_with_penalty,
        )
        for lang, vocab_file in zip(vocab_langs, vocab_files)
    }
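# Usage sketch (illustrative; languages and paths are made up): both English
# corpora feed one shared "en" vocab, while "de" gets its own.
vocabs = build_vocab_multicorpus(
    corpus_langs=["en", "en", "de"],
    corpus_files=["/tmp/news.en", "/tmp/web.en", "/tmp/news.de"],
    vocab_langs=["en", "de"],
    vocab_files=["/tmp/vocab.en", "/tmp/vocab.de"],
    max_vocab_size=50000,
)
# vocabs["en"] was built over both English files; vocabs["de"] over one.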
def preprocess_corpora_bilingual(args):
    source_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=[args.train_source_text_file],
        vocab_file=args.source_vocab_file,
        max_vocab_size=args.source_max_vocab_size,
        tokens_with_penalty=None,
    )
    use_char_source = (args.char_source_vocab_file != "") or (
        getattr(args, "arch", "") == "char_source"
    )
    char_source_dict = None
    if use_char_source:
        char_source_dict = Dictionary.build_vocab_file_if_nonexistent(
            corpus_files=[args.train_source_text_file],
            vocab_file=args.char_source_vocab_file,
            max_vocab_size=args.char_source_max_vocab_size,
            tokens_with_penalty=None,
            is_char_vocab=True,
        )
    if args.train_source_text_file:
        args.train_source_binary_path = binarize_text_file(
            text_file=args.train_source_text_file,
            dictionary=source_dict,
            output_path=args.train_source_binary_path,
            append_eos=args.append_eos_to_source,
            reverse_order=args.reverse_source,
            use_char_data=use_char_source,
            char_dictionary=char_source_dict,
        )
    if args.eval_source_text_file:
        args.eval_source_binary_path = binarize_text_file(
            text_file=args.eval_source_text_file,
            dictionary=source_dict,
            output_path=args.eval_source_binary_path,
            append_eos=args.append_eos_to_source,
            reverse_order=args.reverse_source,
            use_char_data=use_char_source,
            char_dictionary=char_source_dict,
        )

    target_dict = Dictionary.build_vocab_file_if_nonexistent(
        corpus_files=[args.train_target_text_file],
        vocab_file=args.target_vocab_file,
        max_vocab_size=args.target_max_vocab_size,
        tokens_with_penalty=args.penalized_target_tokens_file,
    )
    # For target sentences, we always append EOS tokens, and never reverse
    # their order.
    if args.train_target_text_file:
        args.train_target_binary_path = binarize_text_file(
            text_file=args.train_target_text_file,
            dictionary=target_dict,
            output_path=args.train_target_binary_path,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication the sentence has finished, even
            # if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since
            # even if the source sentence is fed to the model backwards,
            # we still want the model to start outputting from the first word.
            reverse_order=False,
        )
    if args.eval_target_text_file:
        args.eval_target_binary_path = binarize_text_file(
            text_file=args.eval_target_text_file,
            dictionary=target_dict,
            output_path=args.eval_target_binary_path,
            append_eos=True,
            reverse_order=False,
        )
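# Usage sketch (illustrative; every path is a placeholder): after the call,
# the *_binary_path attributes on args point at the freshly written binary
# datasets. Empty eval paths are falsy, so eval binarization is skipped.
preprocess_args = argparse.Namespace(
    train_source_text_file="/tmp/train.src",
    train_target_text_file="/tmp/train.tgt",
    eval_source_text_file="",
    eval_target_text_file="",
    train_source_binary_path="/tmp/train.src.bin",
    train_target_binary_path="/tmp/train.tgt.bin",
    eval_source_binary_path="",
    eval_target_binary_path="",
    source_vocab_file="/tmp/src_vocab.txt",
    target_vocab_file="/tmp/tgt_vocab.txt",
    source_max_vocab_size=30000,
    target_max_vocab_size=30000,
    char_source_vocab_file="",
    char_source_max_vocab_size=0,
    penalized_target_tokens_file=None,
    append_eos_to_source=False,
    reverse_source=True,
)
preprocess_corpora_bilingual(preprocess_args)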
def __init__(self):
    self.target_dictionary = Dictionary()