def _load_dataset_multi_path(
    self,
    split: str,
    src_multiple_bin_paths: Dict[str, str],
    tgt_multiple_bin_paths: Dict[str, str],
    dataset_upsampling: Optional[Dict[str, float]] = None,
    dataset_relative_ratio: Optional[Tuple[str, float]] = None,
    seed: Optional[int] = None,
    noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
):
    """Load several parallel corpora and register them as one sampled dataset.

    Each key of ``src_multiple_bin_paths``/``tgt_multiple_bin_paths`` names a
    corpus; its source and target binary files are loaded into a
    ``LanguagePairDataset``.  The resulting datasets are wrapped in a
    ``MultiCorpusSampledDataset`` stored under ``self.datasets[split]``,
    sampled proportionally to corpus size, optionally rescaled by
    ``dataset_upsampling`` or ``dataset_relative_ratio``.

    Args:
        split: split name under which the dataset is registered.
        src_multiple_bin_paths: corpus key -> source binary file path.
        tgt_multiple_bin_paths: corpus key -> target binary file path.
        dataset_upsampling: optional per-corpus multiplier applied to the
            size-proportional sampling weight. Ignored when
            ``dataset_relative_ratio`` is given.
        dataset_relative_ratio: optional ``(corpus_key, ratio)`` pair; derives
            an upsampling factor so that the named corpus is sampled at
            ``ratio`` relative to the rest.
        seed: RNG seed forwarded to ``NoisingDataset``.
        noiser: optional corpus key -> noiser; matching corpora get their
            source side wrapped in a ``NoisingDataset``.
    """
    corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
        src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths
    )
    datasets = OrderedDict()
    for key in corpora_map.src_files:
        src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
        src_dataset, tgt_dataset = (
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(src),
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(tgt),
        )
        # Capture the sizes of the clean source before (possibly) wrapping it
        # in a NoisingDataset.
        src_sizes = src_dataset.sizes
        if noiser is not None and key in noiser:
            src_dataset = NoisingDataset(
                src_dataset=src_dataset,
                src_dict=self.source_dictionary,
                seed=seed,
                noiser=noiser[key],
            )
        datasets[key] = LanguagePairDataset(
            src=src_dataset,
            src_sizes=src_sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            left_pad_source=False,
        )
    total_line_count = sum(len(datasets[key]) for key in datasets)
    if dataset_relative_ratio is not None:
        ds, ratio = dataset_relative_ratio
        line_count = len(datasets[ds])
        # By definition ratio = u * line_count / sum(#lines of other datasets)
        u = (total_line_count - line_count) / line_count * ratio
        # BUG FIX: apply the derived factor to the corpus named in
        # dataset_relative_ratio (``ds``), not to ``key``, which is merely the
        # last loop variable left over from building ``datasets`` above.
        dataset_upsampling = {ds: u}
    # Base sampling weight is proportional to corpus size.
    dataset_weights = {
        key: 1.0 * len(datasets[key]) / total_line_count
        for key in src_multiple_bin_paths.keys()
    }
    if dataset_upsampling is not None:
        for k, v in dataset_upsampling.items():
            dataset_weights[k] *= v
    print(f"|dataset_weights:{dataset_weights}")
    self.datasets[split] = MultiCorpusSampledDataset(
        datasets=datasets,
        default_key=list(dataset_weights.keys())[0],
        sampling_func=self._normalized_weighted_sampling(dataset_weights),
    )
def _load_dataset_multi_path_helper(
    self,
    split: str,
    src_multiple_bin_paths: Dict[str, str],
    tgt_multiple_bin_paths: Dict[str, str],
    dataset_upsampling: Optional[Dict[str, float]] = None,
    dataset_relative_ratio: Optional[Tuple[str, float]] = None,
    seed: Optional[int] = None,
    noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
):
    """Load several parallel corpora and register them as one concatenation.

    Each key of ``src_multiple_bin_paths``/``tgt_multiple_bin_paths`` names a
    corpus; its source and target binary files are loaded into a
    ``LanguagePairDataset``.  All corpora are concatenated with
    ``ConcatDataset`` under ``self.datasets[split]``, with per-corpus sample
    ratios taken from ``dataset_upsampling`` (default 1.0), optionally derived
    from ``dataset_relative_ratio``.

    Args:
        split: split name under which the dataset is registered.
        src_multiple_bin_paths: corpus key -> source binary file path.
        tgt_multiple_bin_paths: corpus key -> target binary file path.
        dataset_upsampling: optional corpus key -> sample ratio. Overridden
            when ``dataset_relative_ratio`` is given.
        dataset_relative_ratio: optional ``(corpus_key, ratio)`` pair; derives
            an upsampling factor so that the named corpus is sampled at
            ``ratio`` relative to the rest.
        seed: RNG seed forwarded to ``NoisingDataset``.
        noiser: optional corpus key -> noiser; matching corpora get their
            source side wrapped in a ``NoisingDataset``.
    """
    corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
        src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths
    )
    datasets = OrderedDict()
    for key in corpora_map.src_files:
        src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
        src_dataset, tgt_dataset = (
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(src),
            pytorch_translate_data.InMemoryNumpyDataset.create_from_file(tgt),
        )
        # Capture the sizes of the clean source before (possibly) wrapping it
        # in a NoisingDataset.
        src_sizes = src_dataset.sizes
        if noiser is not None and key in noiser:
            src_dataset = NoisingDataset(
                src_dataset=src_dataset,
                src_dict=self.source_dictionary,
                seed=seed,
                noiser=noiser[key],
            )
        datasets[key] = LanguagePairDataset(
            src=src_dataset,
            src_sizes=src_sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            left_pad_source=False,
        )
    total_line_count = sum(len(datasets[key]) for key in datasets)
    if dataset_relative_ratio:
        ds, ratio = dataset_relative_ratio
        line_count = len(datasets[ds])
        # By definition ratio = u * line_count / sum(#lines of other datasets)
        u = (total_line_count - line_count) / line_count * ratio
        # BUG FIX: apply the derived factor to the corpus named in
        # dataset_relative_ratio (``ds``), not to ``key``, which is merely the
        # last loop variable left over from building ``datasets`` above.
        dataset_upsampling = {ds: u}
    elif not dataset_upsampling:
        dataset_upsampling = {}
    print(f"|dataset upsampling:{dataset_upsampling}")
    # Sample ratios follow the (ordered) insertion order of ``datasets``;
    # corpora without an explicit upsampling entry default to 1.0.
    # (Removed an unused ``ds_list`` that was built here but never passed on.)
    sample_ratios = [dataset_upsampling.get(key, 1.0) for key in datasets]
    self.datasets[split] = ConcatDataset(
        datasets=datasets.values(), sample_ratios=sample_ratios
    )
def _load_dataset_multi_path(
    self,
    split: str,
    src_multiple_bin_paths: Dict[str, str],
    tgt_multiple_bin_paths: Dict[str, str],
    dataset_upsampling: Optional[Dict[str, float]],
):
    """Load several parallel corpora and register them as one sampled dataset.

    Each key of ``src_multiple_bin_paths``/``tgt_multiple_bin_paths`` names a
    corpus; its source and target binary files are loaded into a
    ``LanguagePairDataset``.  The corpora are combined into a
    ``MultiCorpusSampledDataset`` stored under ``self.datasets[split]``,
    sampled uniformly by default and rescaled by ``dataset_upsampling``.

    Args:
        split: split name under which the dataset is registered.
        src_multiple_bin_paths: corpus key -> source binary file path.
        tgt_multiple_bin_paths: corpus key -> target binary file path.
        dataset_upsampling: optional corpus key -> multiplier applied on top
            of the uniform sampling weight.
    """
    corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
        src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths
    )
    datasets = OrderedDict()
    for corpus_key, src_path in corpora_map.src_files.items():
        tgt_path = corpora_map.tgt_files[corpus_key]
        source = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            src_path
        )
        target = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            tgt_path
        )
        datasets[corpus_key] = LanguagePairDataset(
            src=source,
            src_sizes=source.sizes,
            src_dict=self.source_dictionary,
            tgt=target,
            tgt_sizes=target.sizes,
            tgt_dict=self.target_dictionary,
            left_pad_source=False,
        )
    # Start from a uniform weight per corpus, then scale any corpus that has
    # an explicit upsampling factor.
    uniform_weight = 1.0 / len(src_multiple_bin_paths)
    dataset_weights = dict.fromkeys(src_multiple_bin_paths, uniform_weight)
    if dataset_upsampling is not None:
        for corpus_key, factor in dataset_upsampling.items():
            dataset_weights[corpus_key] *= factor
    self.datasets[split] = MultiCorpusSampledDataset(
        datasets=datasets,
        default_key=list(dataset_weights.keys())[0],
        sampling_func=self._normalized_weighted_sampling(dataset_weights),
    )