def load_dataset_from_text(
    self,
    split: str,
    source_text_file: str,
    target_text_file: str,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
):
    """Build a parallel dataset from raw text files and register it.

    The resulting pair dataset is stored in ``self.datasets[split]`` and a
    one-line summary with its length is printed.

    Args:
        split: Key under which the dataset is stored in ``self.datasets``.
        source_text_file: Path of the raw source-side text file.
        target_text_file: Path of the raw target-side text file.
        append_eos: Forwarded to the source dataset only; the target side
            always gets ``append_eos=True``.
        reverse_source: Forwarded to the source dataset as its
            ``reverse_order`` setting; the target side is never reversed.
    """
    tgt_dict = self.target_dictionary

    # Target side: EOS is appended unconditionally so the model still learns
    # to signal the end of a sentence, even when the source side omits EOS
    # (which is done to keep UNKs and other words from being misaligned to
    # the very frequent EOS token). The order is never reversed because the
    # model should start emitting from the first word regardless of how the
    # source sentence is fed in.
    target_corpus = data.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=tgt_dict,
        append_eos=True,
        reverse_order=False,
    )

    if self.char_source_dict is None:
        # Word-level source: plain indexed raw text, paired without left
        # padding on the source side.
        source_corpus = data.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=self.source_dictionary,
            append_eos=append_eos,
            reverse_order=reverse_source,
        )
        pair_factory = data.LanguagePairDataset
        extra_kwargs = {"left_pad_source": False}
    else:
        # A character dictionary is configured: parse the source file with
        # both the word and the character dictionaries.
        source_corpus = char_data.InMemoryNumpyWordCharDataset()
        source_corpus.parse(
            path=source_text_file,
            word_dict=self.source_dictionary,
            char_dict=self.char_source_dict,
            reverse_order=reverse_source,
            append_eos=append_eos,
        )
        pair_factory = char_data.LanguagePairSourceCharDataset
        extra_kwargs = {}

    self.datasets[split] = pair_factory(
        src=source_corpus,
        src_sizes=source_corpus.sizes,
        src_dict=self.source_dictionary,
        tgt=target_corpus,
        tgt_sizes=target_corpus.sizes,
        tgt_dict=tgt_dict,
        **extra_kwargs,
    )

    print(f"| {split} {len(self.datasets[split])} examples")
def load_multisource_dataset_from_text(
    self,
    split: str,
    source_text_files: List[str],
    target_text_file: str,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
):
    """Build a multi-source parallel dataset from raw text and register it.

    The resulting dataset is stored in ``self.datasets[split]``.

    Args:
        split: Key under which the dataset is stored in ``self.datasets``.
        source_text_files: Paths of the raw source-side text files, passed
            together as ``path`` to the multi-sentence source dataset.
        target_text_file: Path of the raw target-side text file.
        append_eos: Forwarded to the source dataset only; the target side
            always gets ``append_eos=True``.
        reverse_source: Forwarded to the source dataset as its
            ``reverse_order`` setting; the target side is never reversed.
    """
    src_dict = self.source_dictionary
    multi_source_corpus = multisource_data.IndexedRawTextMultisentDataset(
        path=source_text_files,
        dictionary=src_dict,
        append_eos=append_eos,
        reverse_order=reverse_source,
    )

    # Target side: EOS is appended unconditionally so the model still learns
    # to signal the end of a sentence, even when the source side omits EOS
    # (which is done to keep UNKs and other words from being misaligned to
    # the very frequent EOS token). The order is never reversed because the
    # model should start emitting from the first word regardless of how the
    # source sentences are fed in.
    tgt_dict = self.target_dictionary
    target_corpus = data.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=tgt_dict,
        append_eos=True,
        reverse_order=False,
    )

    # Arguments are passed positionally, exactly as the constructor is
    # invoked elsewhere: source triple first, then target triple.
    paired = multisource_data.MultisourceLanguagePairDataset(
        multi_source_corpus,
        multi_source_corpus.sizes,
        src_dict,
        target_corpus,
        target_corpus.sizes,
        tgt_dict,
    )
    self.datasets[split] = paired