def load_dataset(self, split, src_bin_path, tgt_bin_path, weights_file=None):
    """Load one binarized split and store it in ``self.datasets[split]``.

    Args:
        split: Name of the split; used as the key in ``self.datasets`` and
            in error/log messages.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        weights_file: Optional path to a per-example weights file. It is
            only loaded when the path is set and exists on disk; otherwise
            the dataset is built with ``weights=None``.

    Raises:
        ValueError: If the source or target data file does not exist, or if
            the weights file holds a different number of entries than the
            target dataset.
    """
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=self.args.source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=self.args.target_lang, data_file=tgt_bin_path
        ),
        weights_file=weights_file,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)

    # Source is checked before target so the first missing file is reported.
    for side in (corpus.source, corpus.target):
        if not os.path.exists(side.data_file):
            raise ValueError(f"{side.data_file} for {split} not found!")

    dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )

    weights_dataset = None
    if corpus.weights_file and os.path.exists(corpus.weights_file):
        weights_dataset = weighted_data.IndexedWeightsDataset(corpus.weights_file)
        # Explicit check instead of `assert`: asserts are removed under
        # `python -O`, which would let mismatched weights through silently.
        if len(dst_dataset) != len(weights_dataset):
            raise ValueError(
                f"{corpus.weights_file} for {split} has {len(weights_dataset)} "
                f"weights but the target dataset has {len(dst_dataset)} examples!"
            )

    # A character dictionary on the task selects the word+char source loader
    # and the matching pair dataset; otherwise the plain numpy loader is used.
    if self.char_source_dict is not None:
        src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            corpus.source.data_file
        )
        pair_dataset_cls = char_data.LanguagePairSourceCharDataset
    else:
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
        pair_dataset_cls = weighted_data.WeightedLanguagePairDataset

    self.datasets[split] = pair_dataset_cls(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=dst_dataset,
        tgt_sizes=dst_dataset.sizes,
        tgt_dict=self.target_dictionary,
        weights=weights_dataset,
    )

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} examples")
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, weights_file=None, is_train=False
):
    """Load one binarized split and store it in ``self.datasets[split]``.

    Currently this method does not support character models.

    Args:
        split: Name of the split; key in ``self.datasets`` and log label.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        weights_file: Forwarded to the corpus config only; this method does
            not load any weights itself.
        is_train: When true the split is wrapped in a ``TeacherDataset``
            carrying the task's teacher models and top-k caches; otherwise a
            ``WeightedLanguagePairDataset`` with ``weights=None`` is built.
    """
    corpus_config = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=self.args.source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=self.args.target_lang, data_file=tgt_bin_path
        ),
        weights_file=weights_file,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)

    data_utils.validate_corpus_exists(corpus=corpus_config, split=split)

    # Target side is read first, then source, as in the original flow.
    read_binarized = pytorch_translate_data.InMemoryNumpyDataset.create_from_file
    target_data = read_binarized(corpus_config.target.data_file)
    source_data = read_binarized(corpus_config.source.data_file)

    # Arguments common to both dataset flavours.
    shared_kwargs = dict(
        src=source_data,
        src_sizes=source_data.sizes,
        src_dict=self.src_dict,
        tgt=target_data,
        tgt_sizes=target_data.sizes,
        tgt_dict=self.tgt_dict,
        left_pad_source=False,
    )

    if is_train:
        loaded = TeacherDataset(
            teacher_models=self.teacher_models,
            top_k_teacher_tokens=self.top_k_teacher_tokens,
            top_k_teacher_scores=self.top_k_teacher_scores,
            top_k_teacher_indices=self.top_k_teacher_indices,
            **shared_kwargs,
        )
    else:
        loaded = weighted_data.WeightedLanguagePairDataset(
            weights=None, **shared_kwargs
        )
    self.datasets[split] = loaded

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} examples")
def load_parallel_dataset(
    source_lang,
    target_lang,
    src_bin_path,
    tgt_bin_path,
    source_dictionary,
    target_dictionary,
    split,
    remove_eos_from_source,
    append_eos_to_target=True,
    char_source_dict=None,
    log_verbose=True,
):
    """Load the binarized source/target files of one split as a parallel dataset.

    Args:
        source_lang: Dialect recorded for the source side of the corpus config.
        target_lang: Dialect recorded for the target side of the corpus config.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        source_dictionary: Dictionary passed as ``src_dict`` to the pair dataset.
        target_dictionary: Dictionary passed as ``tgt_dict`` to the pair dataset.
        split: Split name, forwarded to ``validate_corpus_exists``.
        remove_eos_from_source: Forwarded to ``WeightedLanguagePairDataset``.
        append_eos_to_target: Forwarded to ``WeightedLanguagePairDataset``.
        char_source_dict: When not ``None`` the source file is read with the
            word+char loader instead of the plain numpy loader.
        log_verbose: Print a progress line before loading.

    Returns:
        A ``(parallel_dataset, src_dataset, tgt_dataset)`` tuple.
    """
    corpus_config = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if log_verbose:
        print("Starting to load binarized data files.", flush=True)

    validate_corpus_exists(corpus=corpus_config, split=split)

    target_data = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus_config.target.data_file
    )

    # Only the source side has a character-aware variant.
    source_loader = (
        char_data.InMemoryNumpyWordCharDataset
        if char_source_dict is not None
        else pytorch_translate_data.InMemoryNumpyDataset
    )
    source_data = source_loader.create_from_file(corpus_config.source.data_file)

    pairs = weighted_data.WeightedLanguagePairDataset(
        src=source_data,
        src_sizes=source_data.sizes,
        src_dict=source_dictionary,
        tgt=target_data,
        tgt_sizes=target_data.sizes,
        tgt_dict=target_dictionary,
        remove_eos_from_source=remove_eos_from_source,
        append_eos_to_target=append_eos_to_target,
    )
    return pairs, source_data, target_data
def load_dataset(self, split, src_bin_path, tgt_bin_path, seed=None, use_noiser=False):
    """Load a dataset split into ``self.datasets[split]``.

    Seed and noiser are only used for loading train data, not eval data.

    The parallel corpus is always loaded. When ``use_noiser`` is true, up to
    four extra denoising datasets are added, each gated by a flag on
    ``self.args`` (``denoising_source_parallel``, ``denoising_target_parallel``,
    ``denoising_source_mono``, ``denoising_target_mono``). All datasets are
    then combined in a ``RoundRobinZipDatasets``.

    Args:
        split: Name of the split; key in ``self.datasets`` and log label.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        seed: Seed forwarded to every ``noising.NoisingDataset``.
        use_noiser: Enables the denoising datasets described above.
    """
    src_lang = self.source_lang
    tgt_lang = self.target_lang

    parallel_pairs, source_data, target_data = data_utils.load_parallel_dataset(
        source_lang=src_lang,
        target_lang=tgt_lang,
        src_bin_path=src_bin_path,
        tgt_bin_path=tgt_bin_path,
        source_dictionary=self.source_dictionary,
        target_dictionary=self.target_dictionary,
        split=split,
        remove_eos_from_source=not self.args.append_eos_to_source,
        append_eos_to_target=True,
        char_source_dict=self.char_source_dict,
        log_verbose=self.args.log_verbose,
    )

    def _denoising_pairs(clean_data, dictionary, noiser):
        # Source side: clean_data wrapped in a NoisingDataset; target side:
        # clean_data itself. Only the src_* arguments are supplied, exactly
        # as in the inline constructions this helper replaces.
        noised = noising.NoisingDataset(
            src_dataset=clean_data,
            src_dict=dictionary,
            seed=seed,
            noiser=noiser,
        )
        return weighted_data.WeightedLanguagePairDataset(
            src=noised,
            tgt=clean_data,
            src_sizes=clean_data.sizes,
            src_dict=dictionary,
            remove_eos_from_source=not self.args.append_eos_to_source,
            append_eos_to_target=True,
        )

    def _enabled(flag_name):
        # Flags may be absent from args; a missing flag counts as disabled.
        return getattr(self.args, flag_name, False)

    dataset_map = OrderedDict()
    dataset_map[f"{src_lang}-{tgt_lang}"] = parallel_pairs

    if use_noiser:
        if _enabled("denoising_source_parallel"):
            dataset_map[f"{src_lang}-{src_lang}"] = _denoising_pairs(
                source_data, self.source_dictionary, self.source_noiser
            )
        if _enabled("denoising_target_parallel"):
            dataset_map[f"{tgt_lang}-{tgt_lang}"] = _denoising_pairs(
                target_data, self.target_dictionary, self.target_noiser
            )
        if _enabled("denoising_source_mono"):
            source_mono = self.load_monolingual_dataset(
                self.args.train_mono_source_binary_path
            )
            dataset_map[
                f"{src_lang}-{src_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = _denoising_pairs(
                source_mono, self.source_dictionary, self.source_noiser
            )
        if _enabled("denoising_target_mono"):
            target_mono = self.load_monolingual_dataset(
                self.args.train_mono_target_binary_path
            )
            dataset_map[
                f"{tgt_lang}-{tgt_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = _denoising_pairs(
                target_mono, self.target_dictionary, self.target_noiser
            )

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, forward_model=None, backward_model=None
):
    """Load a dataset split into ``self.datasets[split]``.

    Two parallel datasets are always built from the same pair of files: one
    source->target and one target->source. When both models are given, two
    ``BacktranslationDataset`` entries built from the monolingual files
    named in ``self.args`` are added as well. Everything is combined in a
    ``RoundRobinZipDatasets``.

    Args:
        split: Name of the split; key in ``self.datasets`` and log label.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        forward_model: Model wrapped by the generator used for the
            ``{target}-{source}_<mono>`` backtranslation dataset.
        backward_model: Model wrapped by the generator used for the
            ``{source}-{target}_<mono>`` backtranslation dataset.

    Raises:
        AssertionError: If exactly one of the two models is provided.
    """
    src_lang = self.source_lang
    tgt_lang = self.target_lang

    corpus_config = ptt_data.ParallelCorpusConfig(
        source=ptt_data.CorpusConfig(dialect=src_lang, data_file=src_bin_path),
        target=ptt_data.CorpusConfig(dialect=tgt_lang, data_file=tgt_bin_path),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)

    data_utils.validate_corpus_exists(corpus=corpus_config, split=split)

    # Every direction gets its own dataset instance, so each file is read
    # twice; the read order is kept from the original implementation.
    read_binarized = ptt_data.InMemoryNumpyDataset.create_from_file
    source_file = corpus_config.source.data_file
    target_file = corpus_config.target.data_file
    fwd_tgt_data = read_binarized(target_file)
    bwd_tgt_data = read_binarized(source_file)
    fwd_src_data = read_binarized(source_file)
    bwd_src_data = read_binarized(target_file)

    def _parallel_pairs(src, src_dict, tgt, tgt_dict):
        # Build one translation direction over already-loaded data.
        return weighted_data.WeightedLanguagePairDataset(
            src=src,
            src_sizes=src.sizes,
            src_dict=src_dict,
            tgt=tgt,
            tgt_sizes=tgt.sizes,
            tgt_dict=tgt_dict,
            remove_eos_from_source=self.remove_eos_from_source,
            append_eos_to_target=True,
        )

    forward_pairs = _parallel_pairs(
        fwd_src_data, self.source_dictionary, fwd_tgt_data, self.target_dictionary
    )
    backward_pairs = _parallel_pairs(
        bwd_src_data, self.target_dictionary, bwd_tgt_data, self.source_dictionary
    )

    dataset_map = OrderedDict()
    dataset_map[f"{src_lang}-{tgt_lang}"] = forward_pairs
    dataset_map[f"{tgt_lang}-{src_lang}"] = backward_pairs

    assert (forward_model and backward_model) or (
        forward_model is None and backward_model is None
    ), (
        "Only one of forward or backward models can't be null;"
        " both have to be non-null or null"
    )

    if forward_model and backward_model:
        fwd_generator = beam_decode.SequenceGenerator(
            models=[forward_model], tgt_dict=self.source_dictionary
        )
        bwd_generator = beam_decode.SequenceGenerator(
            models=[backward_model], tgt_dict=self.target_dictionary
        )

        def _mono_pairs(
            path,
            dictionary,
            is_source=False,
            num_examples_limit: Optional[int] = None,
        ):
            # Monolingual data as a source-only LanguagePairDataset.
            mono = self.load_monolingual_dataset(
                path, is_source=is_source, num_examples_limit=num_examples_limit
            )
            return LanguagePairDataset(
                src=mono,
                src_sizes=mono.sizes,
                src_dict=dictionary,
                tgt=None,
                tgt_sizes=None,
                tgt_dict=None,
            )

        # Optional cap on monolingual examples, as a ratio of the number of
        # forward parallel examples.
        mono_limit = None
        if self.args.monolingual_ratio is not None:
            mono_limit = int(self.args.monolingual_ratio * len(forward_pairs))

        mono_source = _mono_pairs(
            path=self.args.train_mono_source_binary_path,
            dictionary=self.source_dictionary,
            is_source=True,
            num_examples_limit=mono_limit,
        )
        mono_target = _mono_pairs(
            path=self.args.train_mono_target_binary_path,
            dictionary=self.target_dictionary,
            is_source=False,
            num_examples_limit=mono_limit,
        )

        # Remove EOS from the input before backtranslation.
        eos_stripped_target = TransformEosDataset(
            dataset=mono_target,
            eos=self.target_dictionary.eos(),
            remove_eos_from_src=True,
        )
        # The original input (now the target) doesn't have an EOS, so we
        # need to add one. The generated backtranslation (now the source)
        # will have an EOS, so we want to remove it.
        target_collater = TransformEosDataset(
            dataset=mono_target,
            eos=self.target_dictionary.eos(),
            append_eos_to_tgt=True,
            remove_eos_from_src=True,
        ).collater
        dataset_map[
            f"{src_lang}-{tgt_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=eos_stripped_target,
            backtranslation_fn=bwd_generator.generate,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            output_collater=target_collater,
        )

        # Same EOS handling on the collater as above: add EOS to the
        # original input (now the target), drop it from the generated
        # backtranslation (now the source). The monolingual source data is
        # passed to BacktranslationDataset unwrapped.
        source_collater = TransformEosDataset(
            dataset=mono_source,
            eos=self.source_dictionary.eos(),
            append_eos_to_tgt=True,
            remove_eos_from_src=True,
        ).collater
        dataset_map[
            f"{tgt_lang}-{src_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=mono_source,
            backtranslation_fn=fwd_generator.generate,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            output_collater=source_collater,
        )

    # print before loading RoundRobinZipDatasets to help catch any bugs
    for dataset_key, dataset in dataset_map.items():
        print(f"| {split}: {dataset_key} {len(dataset)} examples in dataset")

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)
    print(
        f"| {split} {len(self.datasets[split])} examples in RoundRobinZipDatasets"
    )

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)
def load_binarized_dataset(
    train_corpus: ParallelCorpusConfig,
    eval_corpus: ParallelCorpusConfig,
    train_split: str,
    eval_split: str,
    args: argparse.Namespace,
    use_char_source: bool = False,
) -> data.LanguageDatasets:
    """Load the binarized train and eval corpora into one ``LanguageDatasets``.

    Args:
        train_corpus: Corpus config for the training split. Its dialects are
            also used as the ``src``/``dst`` of the returned container.
        eval_corpus: Corpus config for the evaluation split.
        train_split: Key under which the training data is stored in
            ``dataset.splits``.
        eval_split: Key under which the evaluation data is stored.
        args: Parsed arguments. Vocabulary paths are read from it and, when
            ``use_char_source`` is true, ``args.char_source_dict_size`` is
            set as a side effect.
        use_char_source: Load the character vocabulary and build word+char
            source datasets instead of plain ones.

    Returns:
        The populated ``data.LanguageDatasets``.

    Raises:
        ValueError: If a source or target data file is missing, or if a
            weights file holds a different number of entries than the
            corresponding target dataset.
    """
    if is_multilingual(args):
        # Dummy dictionaries
        source_dict = pytorch_translate_dictionary.Dictionary()
        target_dict = pytorch_translate_dictionary.Dictionary()
    else:
        source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.source_vocab_file
        )
        target_dict = pytorch_translate_dictionary.Dictionary.load(
            args.target_vocab_file
        )

    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file
        )
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)

    dataset = data.LanguageDatasets(
        src=train_corpus.source.dialect,
        dst=train_corpus.target.dialect,
        src_dict=source_dict,
        dst_dict=target_dict,
    )

    for split, corpus in [(train_split, train_corpus), (eval_split, eval_corpus)]:
        # Source is checked before target so the first missing file is reported.
        for side in (corpus.source, corpus.target):
            if not os.path.exists(side.data_file):
                raise ValueError(f"{side.data_file} for {split} not found!")

        dst_dataset = InMemoryNumpyDataset.create_from_file(corpus.target.data_file)

        # Weights are optional: a missing or unset weights file simply means
        # the split is built without weights.
        weights_dataset = None
        if corpus.weights_file and os.path.exists(corpus.weights_file):
            weights_dataset = weighted_data.IndexedWeightsDataset(
                corpus.weights_file
            )
            # Explicit check instead of `assert`: asserts are removed under
            # `python -O`, which would let mismatched weights through silently.
            if len(dst_dataset) != len(weights_dataset):
                raise ValueError(
                    f"{corpus.weights_file} for {split} has "
                    f"{len(weights_dataset)} weights but the target dataset "
                    f"has {len(dst_dataset)} examples!"
                )

        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file
            )
            pair_dataset_cls = char_data.LanguagePairSourceCharDataset
        else:
            src_dataset = InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file
            )
            pair_dataset_cls = weighted_data.WeightedLanguagePairDataset

        dataset.splits[split] = pair_dataset_cls(
            src=src_dataset,
            dst=dst_dataset,
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
            weights=weights_dataset,
        )

    return dataset
def load_dataset(self, split, src_bin_path, tgt_bin_path, seed=None, noiser=None):
    """Load a dataset split into ``self.datasets[split]``.

    Seed and noiser are only used for loading train data, not eval data.

    The parallel corpus is always loaded. When ``noiser`` is not ``None``,
    up to four extra denoising datasets are added, each gated by a flag on
    ``self.args`` (``denoising_source_parallel``, ``denoising_target_parallel``,
    ``denoising_source_mono``, ``denoising_target_mono``). All datasets are
    then combined in a ``RoundRobinZipDatasets``.

    Args:
        split: Name of the split; key in ``self.datasets`` and log label.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        seed: Seed forwarded to every ``noising.NoisingDataset``.
        noiser: Enables the denoising datasets when not ``None``.
    """
    src_lang = self.source_lang
    tgt_lang = self.target_lang

    corpus_config = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=src_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=tgt_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)

    data_utils.validate_corpus_exists(corpus=corpus_config, split=split)

    target_data = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus_config.target.data_file
    )

    # Only the source side has a character-aware variant.
    source_loader = (
        char_data.InMemoryNumpyWordCharDataset
        if self.char_source_dict is not None
        else pytorch_translate_data.InMemoryNumpyDataset
    )
    source_data = source_loader.create_from_file(corpus_config.source.data_file)

    dataset_map = OrderedDict()
    dataset_map[f"{src_lang}-{tgt_lang}"] = weighted_data.WeightedLanguagePairDataset(
        src=source_data,
        src_sizes=source_data.sizes,
        src_dict=self.source_dictionary,
        tgt=target_data,
        tgt_sizes=target_data.sizes,
        tgt_dict=self.target_dictionary,
        remove_eos_from_source=not self.args.append_eos_to_source,
        append_eos_to_target=True,
    )

    def _denoising_pairs(clean_data, dictionary):
        # Source side: clean_data wrapped in a NoisingDataset; target side:
        # clean_data itself. Only the src_* arguments are supplied, exactly
        # as in the inline constructions this helper replaces.
        # NOTE(review): the `noiser` argument only gates the branch below;
        # the datasets are built with `self.noiser`, so a different noiser
        # passed by the caller is ignored. Confirm this is intended.
        noised = noising.NoisingDataset(
            src_dataset=clean_data,
            src_dict=dictionary,
            seed=seed,
            noiser=self.noiser,
        )
        return weighted_data.WeightedLanguagePairDataset(
            src=noised,
            tgt=clean_data,
            src_sizes=clean_data.sizes,
            src_dict=dictionary,
            remove_eos_from_source=not self.args.append_eos_to_source,
            append_eos_to_target=True,
        )

    def _enabled(flag_name):
        # Flags may be absent from args; a missing flag counts as disabled.
        return getattr(self.args, flag_name, False)

    if noiser is not None:
        if _enabled("denoising_source_parallel"):
            dataset_map[f"{src_lang}-{src_lang}"] = _denoising_pairs(
                source_data, self.source_dictionary
            )
        if _enabled("denoising_target_parallel"):
            dataset_map[f"{tgt_lang}-{tgt_lang}"] = _denoising_pairs(
                target_data, self.target_dictionary
            )
        if _enabled("denoising_source_mono"):
            source_mono = self.load_monolingual_dataset(
                self.args.train_mono_source_binary_path
            )
            dataset_map[
                f"{src_lang}-{src_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = _denoising_pairs(source_mono, self.source_dictionary)
        if _enabled("denoising_target_mono"):
            target_mono = self.load_monolingual_dataset(
                self.args.train_mono_target_binary_path
            )
            dataset_map[
                f"{tgt_lang}-{tgt_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = _denoising_pairs(target_mono, self.target_dictionary)

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, forward_model=None, backward_model=None
):
    """Load a dataset split into ``self.datasets[split]``.

    Two parallel datasets are always built from the same pair of files: one
    source->target and one target->source. When both models are given, two
    ``BacktranslationDataset`` entries over the monolingual files named in
    ``self.args`` are added. Everything is combined in a
    ``RoundRobinZipDatasets``.

    Args:
        split: Name of the split; key in ``self.datasets`` and log label.
        src_bin_path: Path to the binarized source-side data file.
        tgt_bin_path: Path to the binarized target-side data file.
        forward_model: Passed as ``backtranslation_model`` for the
            monolingual source data.
        backward_model: Passed as ``backtranslation_model`` for the
            monolingual target data.

    Raises:
        AssertionError: If exactly one of the two models is provided.
    """
    src_lang = self.source_lang
    tgt_lang = self.target_lang

    corpus_config = ptt_data.ParallelCorpusConfig(
        source=ptt_data.CorpusConfig(dialect=src_lang, data_file=src_bin_path),
        target=ptt_data.CorpusConfig(dialect=tgt_lang, data_file=tgt_bin_path),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)

    data_utils.validate_corpus_exists(corpus=corpus_config, split=split)

    # Every direction gets its own dataset instance, so each file is read
    # twice; the read order is kept from the original implementation.
    read_binarized = ptt_data.InMemoryNumpyDataset.create_from_file
    source_file = corpus_config.source.data_file
    target_file = corpus_config.target.data_file
    fwd_tgt_data = read_binarized(target_file)
    bwd_tgt_data = read_binarized(source_file)
    fwd_src_data = read_binarized(source_file)
    bwd_src_data = read_binarized(target_file)

    def _parallel_pairs(src, src_dict, tgt, tgt_dict):
        # Build one translation direction over already-loaded data.
        return weighted_data.WeightedLanguagePairDataset(
            src=src,
            src_sizes=src.sizes,
            src_dict=src_dict,
            tgt=tgt,
            tgt_sizes=tgt.sizes,
            tgt_dict=tgt_dict,
            remove_eos_from_source=self.remove_eos_from_source,
            append_eos_to_target=True,
        )

    dataset_map = OrderedDict()
    dataset_map[f"{src_lang}-{tgt_lang}"] = _parallel_pairs(
        fwd_src_data, self.source_dictionary, fwd_tgt_data, self.target_dictionary
    )
    dataset_map[f"{tgt_lang}-{src_lang}"] = _parallel_pairs(
        bwd_src_data, self.target_dictionary, bwd_tgt_data, self.source_dictionary
    )

    assert (forward_model and backward_model) or (
        forward_model is None and backward_model is None
    ), (
        "Only one of forward or backward models can't be null;"
        " both have to be non-null or null"
    )

    if forward_model and backward_model:

        def _backtranslated(mono_path, dictionary, model):
            # Monolingual data from mono_path, backtranslated by model.
            return BacktranslationDataset(
                tgt_dataset=self.load_monolingual_dataset(mono_path),
                tgt_dict=dictionary,
                backtranslation_model=model,
                max_len_a=self.args.max_len_a,
                max_len_b=self.args.max_len_b,
                remove_eos_at_src=True,
                generator_class=beam_decode.SequenceGenerator,
            )

        dataset_map[
            f"{src_lang}-{tgt_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = _backtranslated(
            self.args.train_mono_target_binary_path,
            self.target_dictionary,
            backward_model,
        )
        dataset_map[
            f"{tgt_lang}-{src_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = _backtranslated(
            self.args.train_mono_source_binary_path,
            self.source_dictionary,
            forward_model,
        )

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")