Example #1
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        dataset_map = OrderedDict()

        for lang in self.langs2id.keys():
            # Datasets are expected to be in "split.lang" format (Eg: train.en)
            language_split = '{}.{}'.format(split, lang)

            block_dataset, sizes = self._load_single_lang_dataset(split=language_split, epoch=epoch)

            dataset_map[lang] = MaskedLMDataset(
                dataset=block_dataset,
                sizes=sizes,
                vocab=self.dictionary,
                pad_idx=self.dictionary.pad(),
                mask_idx=self.dictionary.mask(),
                classif_token_idx=self.dictionary.eos(),
                sep_token_idx=self.dictionary.eos(),
                shuffle=getattr(self.args, 'shuffle', False),
                has_pairs=False,
                segment_id=self.langs2id[lang],
                seed=self.seed,
            )

        self.datasets[split] = MultiCorpusSampledDataset(dataset_map)
        print('| {} {} {} examples'.format(
            self.args.data.split(os.pathsep)[epoch], split, len(self.datasets[split]))
        )
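Example #1 does not pass a sampling_func, so MultiCorpusSampledDataset falls back to its default behavior of drawing the corpus key uniformly at random for each sample. A minimal sketch of a callable with the same contract (list of corpus keys in, one chosen key out), assuming the fairseq versions these snippets target; the commented usage line is hypothetical:

import numpy as np

def uniform_key_sampler(keys):
    # Pick one corpus key uniformly at random from the available keys.
    return np.random.choice(keys, 1).item()

# Hypothetical drop-in usage with the dataset_map built above:
# self.datasets[split] = MultiCorpusSampledDataset(
#     dataset_map, sampling_func=uniform_key_sampler)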
Example #2
    def _load_dataset_multi_path(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]] = None,
        dataset_relative_ratio: Optional[Tuple[str, float]] = None,
        seed: Optional[int] = None,
        noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths)
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            src_dataset, tgt_dataset = (
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src),
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    tgt),
            )
            src_sizes = src_dataset.sizes
            if noiser is not None and key in noiser:
                src_dataset = NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser[key],
                )
            datasets[key] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_sizes,
                src_dict=self.source_dictionary,
                tgt=tgt_dataset,
                tgt_sizes=tgt_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
        total_line_count = sum(len(datasets[key]) for key in datasets)
        if dataset_relative_ratio is not None:
            ds, ratio = dataset_relative_ratio
            line_count = len(datasets[ds])
            # By definition ratio = u * line_count / sum(#lines of other datasets)
            u = (total_line_count - line_count) / line_count * ratio
            dataset_upsampling = {ds: u}

        dataset_weights = {
            key: 1.0 * len(datasets[key]) / total_line_count
            for key in src_multiple_bin_paths.keys()
        }
        if dataset_upsampling is not None:
            for k, v in dataset_upsampling.items():
                dataset_weights[k] *= v
        print(f"|dataset_weights:{dataset_weights}")
        self.datasets[split] = MultiCorpusSampledDataset(
            datasets=datasets,
            default_key=list(dataset_weights.keys())[0],
            sampling_func=self._normalized_weighted_sampling(dataset_weights),
        )
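Example #2 weights each corpus by its share of the total line count, optionally rescales one corpus via dataset_upsampling or dataset_relative_ratio, and delegates the actual draw to self._normalized_weighted_sampling, whose implementation is not shown here. A hedged sketch of what such a helper could look like, plus the relative-ratio arithmetic on assumed corpus sizes:

import numpy as np

def normalized_weighted_sampling(weights):
    # Hypothetical stand-in for self._normalized_weighted_sampling: given
    # per-corpus weights, return a sampling_func that picks a key with
    # probability proportional to its normalized weight.
    def sample(candidate_keys):
        w = np.array([weights[k] for k in candidate_keys], dtype=np.float64)
        w /= w.sum()
        return candidate_keys[np.random.choice(len(candidate_keys), p=w)]
    return sample

# Assumed sizes: "small" has 1,000 lines, "large" has 9,000. With
# dataset_relative_ratio=("small", 0.5):
#   u = (10000 - 1000) / 1000 * 0.5 = 4.5
# so the size-proportional weights {small: 0.1, large: 0.9} become
# {small: 0.45, large: 0.9}, i.e. the small corpus is sampled half as
# often as the large one, as the ratio definition requires.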
Example #3
    def _test_sample_helper(
        self,
        expected_sample_from_first_ds_percentage,
        num_samples=1000,
        sampling_func=None,
    ):
        # To make sure test is not flaky
        np.random.seed(0)
        if sampling_func is None:
            m = MultiCorpusSampledDataset(
                OrderedDict({0: self.dataset_1, 1: self.dataset_2}),
            )
        else:
            m = MultiCorpusSampledDataset(
                OrderedDict({0: self.dataset_1, 1: self.dataset_2}),
                sampling_func=sampling_func,
            )
        m.ordered_indices()
        count_sample_from_first_dataset = 0
        for _ in range(num_samples):
            if m.collater([m[0], m[1]])["net_input"]["src_tokens"][0] == 1:
                count_sample_from_first_dataset += 1
        sample_from_first_ds_percentage = (
            1.0 * count_sample_from_first_dataset / num_samples
        )
        self.assertLess(
            abs(
                sample_from_first_ds_percentage
                - expected_sample_from_first_ds_percentage
            ),
            0.01,
        )
Example #4
    def load_dataset(self, split, combine=False):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        dataset_map = OrderedDict()

        for lang in self.langs2id.keys():
            if self.default_key is None:
                self.default_key = lang
            # Datasets are expected to be in "split.lang" format (Eg: train.en)
            language_split = '{}.{}'.format(split, lang)
            path = os.path.join(self.args.data, language_split)

            if self.args.raw_text and IndexedRawTextDataset.exists(path):
                ds = IndexedRawTextDataset(path, self.dictionary)
            elif not self.args.raw_text and IndexedDataset.exists(path):
                if self.args.lazy_load:
                    ds = IndexedDataset(path, fix_lua_indexing=True)
                else:
                    ds = IndexedCachedDataset(path, fix_lua_indexing=True)
            else:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(
                    language_split, self.args.data))

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            block_dataset = TokenBlockDataset(
                dataset=ds,
                sizes=ds.sizes,
                block_size=self.args.tokens_per_sample - 1,
                pad=self.dictionary.pad(),
                eos=self.dictionary.eos())

            dataset_map[lang] = MaskedLMDataset(
                dataset=block_dataset,
                sizes=block_dataset.sizes,
                vocab=self.dictionary,
                pad_idx=self.dictionary.pad(),
                mask_idx=self.dictionary.mask(),
                classif_token_idx=self.dictionary.eos(),
                sep_token_idx=self.dictionary.eos(),
                shuffle=getattr(self.args, 'shuffle', False),
                has_pairs=False,
                segment_id=self.langs2id[lang],
                seed=self.seed,
            )

        self.datasets[split] = MultiCorpusSampledDataset(
            dataset_map, default_key=self.default_key)
        print('| {} {} {} examples'.format(self.args.data, split,
                                           len(self.datasets[split])))
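The block_size arithmetic in Example #4 follows the comment next to TokenBlockDataset: one position per block is left free for the classification token that MaskedLMDataset adds, so the wrapped sequences come out at exactly tokens_per_sample. A toy check with an assumed --tokens-per-sample of 512:

tokens_per_sample = 512                 # assumed value of --tokens-per-sample
block_size = tokens_per_sample - 1      # what TokenBlockDataset receives above
wrapped_length = block_size + 1         # block plus the classification token
assert wrapped_length == tokens_per_sample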
Example #5
    def _load_dataset_multi_path(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]],
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths)
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            src_dataset, tgt_dataset = (
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src),
                pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    tgt),
            )
            datasets[key] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=tgt_dataset,
                tgt_sizes=tgt_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
        dataset_weights = {
            key: 1.0 / len(src_multiple_bin_paths)
            for key in src_multiple_bin_paths.keys()
        }

        if dataset_upsampling is not None:
            for k, v in dataset_upsampling.items():
                dataset_weights[k] *= v

        self.datasets[split] = MultiCorpusSampledDataset(
            datasets=datasets,
            default_key=list(dataset_weights.keys())[0],
            sampling_func=self._normalized_weighted_sampling(dataset_weights),
        )
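Unlike Example #2, this variant starts from uniform weights (1 / number of corpora) rather than size-proportional ones; dataset_upsampling then scales individual entries before the sampling function normalizes them. Illustrative arithmetic with three hypothetical corpora and dataset_upsampling={"c": 2.0}:

weights = {"a": 1 / 3, "b": 1 / 3, "c": 1 / 3}    # uniform starting weights
weights["c"] *= 2.0                                # apply the upsampling factor
total = sum(weights.values())
probs = {k: v / total for k, v in weights.items()}
# probs == {"a": 0.25, "b": 0.25, "c": 0.5}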