def make_language_pair_dataset_from_text(
    source_text_file: str,
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
    char_source_dict: Optional[pytorch_translate_dictionary.Dictionary] = None,
) -> data.LanguagePairDataset:
    """Pair a raw-text source file with a raw-text target file.

    Args:
        source_text_file: Path passed to the source-side dataset.
        target_text_file: Path passed to the target-side dataset.
        source_dict: Dictionary for the source side; also supplies the
            ``pad_idx`` and ``eos_idx`` of the returned pair dataset.
        target_dict: Dictionary for the target side.
        append_eos: Forwarded to the source-side dataset only.
        reverse_source: Forwarded to the source-side dataset as its
            ``reverse_order`` setting.
        char_source_dict: When given, the source side is built as a
            ``char_data.InMemoryNumpyWordCharDataset`` using both
            ``source_dict`` and this dictionary.

    Returns:
        ``char_data.LanguagePairSourceCharDataset`` when ``char_source_dict``
        is provided, otherwise ``data.LanguagePairDataset``.
    """
    # The target always ends with EOS so the model still learns to signal the
    # end of a sentence, even when EOS is left off the source (which is done
    # to keep UNKs and other words from being misaligned to the very frequent
    # EOS token). The target is never reversed either: a model fed a reversed
    # source must still emit its output starting from the first word.
    target_side = indexed_dataset.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=target_dict,
        append_eos=True,
        reverse_order=False,
    )

    # Word-level source only: no character dictionary was supplied.
    if char_source_dict is None:
        word_source = indexed_dataset.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=source_dict,
            append_eos=append_eos,
            reverse_order=reverse_source,
        )
        return data.LanguagePairDataset(
            src=word_source,
            dst=target_side,
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )

    # Word + character source: the dataset is created empty and then filled
    # by parsing the source file.
    char_source = char_data.InMemoryNumpyWordCharDataset()
    char_source.parse(
        path=source_text_file,
        word_dict=source_dict,
        char_dict=char_source_dict,
        reverse_order=reverse_source,
        append_eos=append_eos,
    )
    return char_data.LanguagePairSourceCharDataset(
        src=char_source,
        dst=target_side,
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
def make_language_pair_dataset_from_text_multilingual(
    source_text_file: str,
    target_text_file: str,
    source_lang_id: int,
    target_lang_id: int,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> data.LanguagePairDataset:
    """Pair two raw-text files, each tagged with a language id.

    Args:
        source_text_file: Path passed to the source-side dataset.
        target_text_file: Path passed to the target-side dataset.
        source_lang_id: Language id handed to the source-side dataset.
        target_lang_id: Language id handed to the target-side dataset.
        source_dict: Dictionary for the source side; also supplies the
            ``pad_idx`` and ``eos_idx`` of the returned pair dataset.
        target_dict: Dictionary for the target side.
        append_eos: Forwarded to the source-side dataset only.
        reverse_source: Forwarded to the source-side dataset as its
            ``reverse_order`` setting.

    Returns:
        A ``data.LanguagePairDataset`` whose two sides are
        ``IndexedRawTextDatasetWithLangId`` instances.
    """
    # Source side: honours the caller's EOS/reversal choices and is built
    # with prepend_language_id disabled.
    source_side = IndexedRawTextDatasetWithLangId(
        path=source_text_file,
        dictionary=source_dict,
        lang_id=source_lang_id,
        append_eos=append_eos,
        reverse_order=reverse_source,
        prepend_language_id=False,
    )

    # Target side: EOS is always appended, order is never reversed, and
    # prepend_language_id is enabled.
    target_side = IndexedRawTextDatasetWithLangId(
        path=target_text_file,
        dictionary=target_dict,
        lang_id=target_lang_id,
        append_eos=True,
        reverse_order=False,
        prepend_language_id=True,
    )

    return data.LanguagePairDataset(
        src=source_side,
        dst=target_side,
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
def make_language_pair_dataset(
    source_file: str,
    target_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    args: Optional[argparse.Namespace] = None,
) -> data.LanguagePairDataset:
    """Pair a source file with a target file, configured through ``args``.

    Args:
        source_file: Path passed to the source-side dataset.
        target_file: Path passed to the target-side dataset.
        source_dict: Dictionary for the source side; also supplies the
            ``pad_idx`` and ``eos_idx`` of the returned pair dataset.
        target_dict: Dictionary for the target side.
        args: Optional namespace read for ``append_eos_to_source`` and
            ``reverse_source``. When omitted, the source gets EOS appended
            and is not reversed.

    Returns:
        A ``data.LanguagePairDataset`` over the two files.
    """
    # Resolve the source-side options once, falling back to fixed defaults
    # when no namespace was supplied.
    if args is None:
        source_append_eos = True
        source_reversed = False
    else:
        source_append_eos = args.append_eos_to_source
        source_reversed = args.reverse_source

    source_side = indexed_dataset.IndexedRawTextDataset(
        path=source_file,
        dictionary=source_dict,
        append_eos=source_append_eos,
        reverse_order=source_reversed,
    )

    # The target always ends with EOS so the model still learns to signal the
    # end of a sentence, even when EOS is left off the source (which is done
    # to keep UNKs and other words from being misaligned to the very frequent
    # EOS token). The target is never reversed either: a model fed a reversed
    # source must still emit its output starting from the first word.
    target_side = indexed_dataset.IndexedRawTextDataset(
        path=target_file,
        dictionary=target_dict,
        append_eos=True,
        reverse_order=False,
    )

    return data.LanguagePairDataset(
        src=source_side,
        dst=target_side,
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
def make_multisource_language_pair_dataset_from_text(
    source_text_files: List[str],
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> MultisourceLanguagePairDataset:
    """Pair several raw-text source files with one raw-text target file.

    Args:
        source_text_files: Paths passed together to the source-side dataset.
        target_text_file: Path passed to the target-side dataset.
        source_dict: Dictionary for the source side; also supplies the
            ``pad_idx`` and ``eos_idx`` of the returned pair dataset.
        target_dict: Dictionary for the target side.
        append_eos: Forwarded to the source-side dataset only.
        reverse_source: Forwarded to the source-side dataset as its
            ``reverse_order`` setting.

    Returns:
        A ``MultisourceLanguagePairDataset`` whose source side is an
        ``IndexedRawTextMultisentDataset``.
    """
    # Source side: the full list of paths goes to a single dataset object.
    multi_source = IndexedRawTextMultisentDataset(
        path=source_text_files,
        dictionary=source_dict,
        append_eos=append_eos,
        reverse_order=reverse_source,
    )

    # The target always ends with EOS so the model still learns to signal the
    # end of a sentence, even when EOS is left off the source (which is done
    # to keep UNKs and other words from being misaligned to the very frequent
    # EOS token). The target is never reversed either: a model fed a reversed
    # source must still emit its output starting from the first word.
    target_side = indexed_dataset.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=target_dict,
        append_eos=True,
        reverse_order=False,
    )

    return MultisourceLanguagePairDataset(
        src=multi_source,
        dst=target_side,
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )