def load_dataset(self, split, epoch=0, combine=False, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    dataset_map = OrderedDict()
    for lang in self.langs2id.keys():
        # Datasets are expected to be in "split.lang" format (Eg: train.en)
        language_split = '{}.{}'.format(split, lang)

        block_dataset, sizes = self._load_single_lang_dataset(
            split=language_split, epoch=epoch)

        dataset_map[lang] = MaskedLMDataset(
            dataset=block_dataset,
            sizes=sizes,
            vocab=self.dictionary,
            pad_idx=self.dictionary.pad(),
            mask_idx=self.dictionary.mask(),
            classif_token_idx=self.dictionary.eos(),
            sep_token_idx=self.dictionary.eos(),
            shuffle=getattr(self.args, 'shuffle', False),
            has_pairs=False,
            segment_id=self.langs2id[lang],
            seed=self.seed,
        )

    self.datasets[split] = MultiCorpusSampledDataset(dataset_map)
    print('| {} {} {} examples'.format(
        self.args.data.split(os.pathsep)[epoch],
        split,
        len(self.datasets[split]),
    ))
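This variant delegates per-language loading to a `_load_single_lang_dataset` helper that returns a `(block_dataset, sizes)` pair. The sketch below is only an illustration of what such a helper could look like, reconstructed from the inline logic of the older `load_dataset` variant further below; the actual fairseq helper also handles raw-text and lazy-load options and sharded data paths, so its body and signature may differ.

def _load_single_lang_dataset(self, split, epoch):
    # Illustrative sketch, not fairseq's actual implementation: load the
    # indexed dataset for this split and chunk it into blocks, reserving one
    # position for the classification token (hence tokens_per_sample - 1).
    path = os.path.join(self.args.data.split(os.pathsep)[epoch], split)
    if not IndexedDataset.exists(path):
        raise FileNotFoundError('Dataset not found: {}'.format(path))
    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
    block_dataset = TokenBlockDataset(
        dataset=ds,
        sizes=ds.sizes,
        block_size=self.args.tokens_per_sample - 1,
        pad=self.dictionary.pad(),
        eos=self.dictionary.eos(),
    )
    return block_dataset, block_dataset.sizes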
def _load_dataset_multi_path(
    self,
    split: str,
    src_multiple_bin_paths: Dict[str, str],
    tgt_multiple_bin_paths: Dict[str, str],
    dataset_upsampling: Optional[Dict[str, float]] = None,
    dataset_relative_ratio: Optional[Tuple[str, float]] = None,
    seed: Optional[int] = None,
    noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
):
    corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
        src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths
    )
    datasets = OrderedDict()
    for key in corpora_map.src_files:
        src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
        src_dataset, tgt_dataset = (
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(src),
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(tgt),
        )
        src_sizes = src_dataset.sizes
        if noiser is not None and key in noiser:
            src_dataset = NoisingDataset(
                src_dataset=src_dataset,
                src_dict=self.source_dictionary,
                seed=seed,
                noiser=noiser[key],
            )
        datasets[key] = LanguagePairDataset(
            src=src_dataset,
            src_sizes=src_sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            left_pad_source=False,
        )
    total_line_count = sum(len(datasets[key]) for key in datasets)
    if dataset_relative_ratio is not None:
        ds, ratio = dataset_relative_ratio
        line_count = len(datasets[ds])
        # By definition ratio = u * line_count / sum(#lines of other datasets)
        u = (total_line_count - line_count) / line_count * ratio
        dataset_upsampling = {ds: u}
    dataset_weights = {
        key: 1.0 * len(datasets[key]) / total_line_count
        for key in src_multiple_bin_paths.keys()
    }
    if dataset_upsampling is not None:
        for k, v in dataset_upsampling.items():
            dataset_weights[k] *= v
    print(f"| dataset_weights: {dataset_weights}")
    self.datasets[split] = MultiCorpusSampledDataset(
        datasets=datasets,
        default_key=list(dataset_weights.keys())[0],
        sampling_func=self._normalized_weighted_sampling(dataset_weights),
    )
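The sampler passed to `MultiCorpusSampledDataset` is produced by `self._normalized_weighted_sampling(dataset_weights)`. A plausible sketch of such a helper is shown below, assuming the `sampling_func` contract of fairseq's `MultiCorpusSampledDataset` (a callable that receives the list of candidate keys and returns one of them); the standalone function name here is illustrative and this is not the verbatim pytorch_translate implementation.

from typing import Dict, List

import numpy as np


def normalized_weighted_sampling(weights: Dict[str, float]):
    # Sketch of the weighted sampler; in the task it would be the
    # _normalized_weighted_sampling method. It returns a closure that picks
    # one corpus key with probability proportional to its weight.
    def sampling_func(candidate_keys: List[str]) -> str:
        w = np.array([weights[k] for k in candidate_keys], dtype=np.float64)
        probs = w / w.sum()
        idx = np.random.choice(len(candidate_keys), p=probs)
        return candidate_keys[idx]

    return sampling_func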
def _test_sample_helper(
    self,
    expected_sample_from_first_ds_percentage,
    num_samples=1000,
    sampling_func=None,
):
    # To make sure test is not flaky
    np.random.seed(0)
    if sampling_func is None:
        m = MultiCorpusSampledDataset(
            OrderedDict({0: self.dataset_1, 1: self.dataset_2}),
        )
    else:
        m = MultiCorpusSampledDataset(
            OrderedDict({0: self.dataset_1, 1: self.dataset_2}),
            sampling_func=sampling_func,
        )
    m.ordered_indices()
    count_sample_from_first_dataset = 0
    for _ in range(num_samples):
        if m.collater([m[0], m[1]])["net_input"]["src_tokens"][0] == 1:
            count_sample_from_first_dataset += 1
    sample_from_first_ds_percentage = (
        1.0 * count_sample_from_first_dataset / num_samples
    )
    self.assertLess(
        abs(
            sample_from_first_ds_percentage
            - expected_sample_from_first_ds_percentage
        ),
        0.01,
    )
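For reference, the helper could be exercised along the lines sketched below; the test method names and concrete expectations are illustrative rather than copied from the real test file.

def test_uniform_sampling(self):
    # With the default (uniform) sampler each dataset should be drawn about
    # half the time; the tight 0.01 tolerance relies on the fixed
    # np.random.seed(0) keeping the empirical frequency close to 0.5.
    self._test_sample_helper(expected_sample_from_first_ds_percentage=0.5)

def test_always_first_dataset(self):
    # A degenerate sampler that always returns the first candidate key picks
    # the first dataset for every sample, so the observed fraction is 1.0.
    self._test_sample_helper(
        expected_sample_from_first_ds_percentage=1.0,
        sampling_func=lambda candidate_keys: candidate_keys[0],
    )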
def load_dataset(self, split, combine=False):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    dataset_map = OrderedDict()
    for lang in self.langs2id.keys():
        if self.default_key is None:
            self.default_key = lang
        # Datasets are expected to be in "split.lang" format (Eg: train.en)
        language_split = '{}.{}'.format(split, lang)
        path = os.path.join(self.args.data, language_split)

        if self.args.raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
        elif not self.args.raw_text and IndexedDataset.exists(path):
            if self.args.lazy_load:
                ds = IndexedDataset(path, fix_lua_indexing=True)
            else:
                ds = IndexedCachedDataset(path, fix_lua_indexing=True)
        else:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(
                language_split, self.args.data))

        # Since we append each block with the classification_token,
        # we need to effectively create blocks of length
        # tokens_per_sample - 1
        block_dataset = TokenBlockDataset(
            dataset=ds,
            sizes=ds.sizes,
            block_size=self.args.tokens_per_sample - 1,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
        )

        dataset_map[lang] = MaskedLMDataset(
            dataset=block_dataset,
            sizes=block_dataset.sizes,
            vocab=self.dictionary,
            pad_idx=self.dictionary.pad(),
            mask_idx=self.dictionary.mask(),
            classif_token_idx=self.dictionary.eos(),
            sep_token_idx=self.dictionary.eos(),
            shuffle=getattr(self.args, 'shuffle', False),
            has_pairs=False,
            segment_id=self.langs2id[lang],
            seed=self.seed,
        )

    self.datasets[split] = MultiCorpusSampledDataset(
        dataset_map, default_key=self.default_key)
    print('| {} {} {} examples'.format(
        self.args.data, split, len(self.datasets[split])))
def _load_dataset_multi_path(
    self,
    split: str,
    src_multiple_bin_paths: Dict[str, str],
    tgt_multiple_bin_paths: Dict[str, str],
    dataset_upsampling: Optional[Dict[str, float]],
):
    corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
        src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths
    )
    datasets = OrderedDict()
    for key in corpora_map.src_files:
        src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
        src_dataset, tgt_dataset = (
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(src),
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(tgt),
        )
        datasets[key] = LanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            left_pad_source=False,
        )
    dataset_weights = {
        key: 1.0 / len(src_multiple_bin_paths)
        for key in src_multiple_bin_paths.keys()
    }
    if dataset_upsampling is not None:
        for k, v in dataset_upsampling.items():
            dataset_weights[k] *= v
    self.datasets[split] = MultiCorpusSampledDataset(
        datasets=datasets,
        default_key=list(dataset_weights.keys())[0],
        sampling_func=self._normalized_weighted_sampling(dataset_weights),
    )
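In this variant every corpus starts with the same weight (1 / number of corpora), and `dataset_upsampling` scales individual entries before the sampler normalizes them. A small worked example of that arithmetic, using hypothetical corpus keys:

# Two corpora start at 0.5 each; upsampling "monolingual" by 2.0 raises its
# weight to 1.0, so after normalization inside the weighted sampler it is
# drawn with probability 1.0 / (0.5 + 1.0) == 2/3 and "parallel" with 1/3.
weights = {key: 1.0 / 2 for key in ("parallel", "monolingual")}
upsampling = {"monolingual": 2.0}
for k, v in upsampling.items():
    weights[k] *= v
assert weights == {"parallel": 0.5, "monolingual": 1.0}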