Exemplos de IndexedRawTextDataset em Python

Linguagem de programação: Python

Espaço para nome / nome do pacote: fairseq.data

Método / Função: IndexedRawTextDataset

Exemplos em hotexamples.com: 2

IndexedRawTextDataset em Python - 2 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de fairseq.data.IndexedRawTextDataset em Python extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Exemplo n.º 1

0

Exibir arquivo

Arquivo: pytorch_translate_task.py Projeto: florischabert/translate

def load_dataset_from_text(
    self,
    split: str,
    source_text_file: str,
    target_text_file: str,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
):
    """Build a source/target pair dataset from raw text and register it.

    The resulting dataset is stored in ``self.datasets`` under ``split``
    and a one-line summary with its length is printed.

    Args:
        split: Key under which the dataset is stored in ``self.datasets``.
        source_text_file: Path handed to the source-side dataset.
        target_text_file: Path handed to the target-side dataset.
        append_eos: Forwarded to the source-side dataset only; the
            target side always uses ``append_eos=True``.
        reverse_source: Forwarded to the source-side dataset as
            ``reverse_order``; the target side is never reversed.
    """
    # Target side.
    # EOS is always appended here: the model must still emit an
    # end-of-sentence marker even when the source side omits EOS (which
    # is done to keep the model from misaligning UNKs or other words to
    # the frequently occurring EOS).
    # The order is never reversed: even if the source is fed backwards,
    # the model should start outputting from the first word.
    target_data = data.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=self.target_dictionary,
        append_eos=True,
        reverse_order=False,
    )

    if self.char_source_dict is None:
        # Word-level source.
        source_data = data.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=self.source_dictionary,
            append_eos=append_eos,
            reverse_order=reverse_source,
        )
        pair = data.LanguagePairDataset(
            src=source_data,
            src_sizes=source_data.sizes,
            src_dict=self.source_dictionary,
            tgt=target_data,
            tgt_sizes=target_data.sizes,
            tgt_dict=self.target_dictionary,
            left_pad_source=False,
        )
    else:
        # A character dictionary is configured: parse the source with
        # both the word and the character dictionaries.
        source_data = char_data.InMemoryNumpyWordCharDataset()
        source_data.parse(
            path=source_text_file,
            word_dict=self.source_dictionary,
            char_dict=self.char_source_dict,
            reverse_order=reverse_source,
            append_eos=append_eos,
        )
        pair = char_data.LanguagePairSourceCharDataset(
            src=source_data,
            src_sizes=source_data.sizes,
            src_dict=self.source_dictionary,
            tgt=target_data,
            tgt_sizes=target_data.sizes,
            tgt_dict=self.target_dictionary,
        )

    self.datasets[split] = pair
    print(f"| {split} {len(self.datasets[split])} examples")

Exemplo n.º 2

0

Exibir arquivo

def load_multisource_dataset_from_text(
    self,
    split: str,
    source_text_files: List[str],
    target_text_file: str,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
):
    """Build a multi-source/target pair dataset from raw text and register it.

    The resulting dataset is stored in ``self.datasets`` under ``split``.

    Args:
        split: Key under which the dataset is stored in ``self.datasets``.
        source_text_files: Paths handed to the multi-sentence source
            dataset as its ``path`` argument.
        target_text_file: Path handed to the target-side dataset.
        append_eos: Forwarded to the source-side dataset only; the
            target side always uses ``append_eos=True``.
        reverse_source: Forwarded to the source-side dataset as
            ``reverse_order``; the target side is never reversed.
    """
    source_data = multisource_data.IndexedRawTextMultisentDataset(
        path=source_text_files,
        dictionary=self.source_dictionary,
        append_eos=append_eos,
        reverse_order=reverse_source,
    )

    # Target side.
    # EOS is always appended here: the model must still emit an
    # end-of-sentence marker even when the source side omits EOS (which
    # is done to keep the model from misaligning UNKs or other words to
    # the frequently occurring EOS).
    # The order is never reversed: even if the source is fed backwards,
    # the model should start outputting from the first word.
    target_data = data.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=self.target_dictionary,
        append_eos=True,
        reverse_order=False,
    )

    pair = multisource_data.MultisourceLanguagePairDataset(
        source_data,
        source_data.sizes,
        self.source_dictionary,
        target_data,
        target_data.sizes,
        self.target_dictionary,
    )
    self.datasets[split] = pair