    def load_dataset_from_text(
        self,
        split: str,
        source_text_file: str,
        target_text_file: str,
        append_eos: Optional[bool] = False,
        reverse_source: Optional[bool] = True,
    ):
        dst_dataset = data.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=self.target_dictionary,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication the sentence has finished, even
            # if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since
            # even if the source sentence is fed to the model backwards,
            # we still want the model to start outputting from the first word.
            reverse_order=False,
        )

        if self.char_source_dict is not None:
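            # A character-level source dictionary is configured: parse the raw
            # source text into a combined word + character dataset.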
            src_dataset = char_data.InMemoryNumpyWordCharDataset()
            src_dataset.parse(
                path=source_text_file,
                word_dict=self.source_dictionary,
                char_dict=self.char_source_dict,
                reverse_order=reverse_source,
                append_eos=append_eos,
            )
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
            )
        else:
            src_dataset = data.IndexedRawTextDataset(
                path=source_text_file,
                dictionary=self.source_dictionary,
                append_eos=append_eos,
                reverse_order=reverse_source,
            )
            self.datasets[split] = data.LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )

        print(f"| {split} {len(self.datasets[split])} examples")
Example #2
    def _load_dataset_single_path(
        self,
        split: str,
        src_bin_path: str,
        tgt_bin_path: str,
        weights_file=None,
        is_npz=True,
    ):
        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.args.source_lang, data_file=src_bin_path),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.args.target_lang, data_file=tgt_bin_path),
            weights_file=weights_file,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus,
                                          split=split,
                                          is_npz=is_npz)

        dst_dataset = pytorch_translate_data.InMemoryIndexedDataset.create_from_file(
            corpus.target.data_file, is_npz=is_npz)
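        # If args.reverse_target is set, each target sentence is reversed in
        # place (e.g. for right-to-left target-side experiments).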
        if getattr(self.args, "reverse_target", None):
            dst_dataset.reverse()
        weights_dataset = None
        if corpus.weights_file and os.path.exists(corpus.weights_file):
            weights_dataset = weighted_data.IndexedWeightsDataset(
                corpus.weights_file)
            assert len(dst_dataset) == len(weights_dataset)

        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryIndexedDataset.create_from_file(
                corpus.source.data_file, is_npz=is_npz)
            self.datasets[split] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
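
A minimal sketch of how this helper might be driven, assuming a `task` instance and binarized source/target files (the paths and split name below are illustrative, not from the example):

    # Hypothetical usage sketch; `task` and the .npz paths are assumptions.
    task._load_dataset_single_path(
        split="train",
        src_bin_path="data/train.src.npz",
        tgt_bin_path="data/train.tgt.npz",
        weights_file=None,
        is_npz=True,
    )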
Example #3
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     weights_file=None):
        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.args.source_lang, data_file=src_bin_path),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.args.target_lang, data_file=tgt_bin_path),
            weights_file=weights_file,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        weights_dataset = None
        if corpus.weights_file and os.path.exists(corpus.weights_file):
            weights_dataset = weighted_data.IndexedWeightsDataset(
                corpus.weights_file)
            assert len(dst_dataset) == len(weights_dataset)

        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file)
            self.datasets[split] = weighted_data.WeightedLanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
                left_pad_source=False,
            )

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} examples")
    def _load_dataset_multi_path_helper(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]] = None,
        dataset_relative_ratio: Optional[Tuple[str, float]] = None,
        seed: Optional[int] = None,
        noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths)
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                tgt)

            if self.char_source_dict is not None:
                src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                    src)

            else:
                src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src)
            src_sizes = src_dataset.sizes
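            # Optionally wrap the source with noise (e.g. for denoising or
            # unsupervised MT setups); batching still uses the un-noised
            # sizes captured above.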
            if noiser is not None and key in noiser:
                src_dataset = NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser[key],
                )
            if self.char_source_dict is not None:
                datasets[key] = char_data.LanguagePairSourceCharDataset(
                    src=src_dataset,
                    src_sizes=src_sizes,
                    src_dict=self.source_dictionary,
                    tgt=tgt_dataset,
                    tgt_sizes=tgt_dataset.sizes,
                    tgt_dict=self.target_dictionary,
                )
            else:
                datasets[key] = LanguagePairDataset(
                    src=src_dataset,
                    src_sizes=src_sizes,
                    src_dict=self.source_dictionary,
                    tgt=tgt_dataset,
                    tgt_sizes=tgt_dataset.sizes,
                    tgt_dict=self.target_dictionary,
                    left_pad_source=False,
                )
        total_line_count = sum(len(datasets[key]) for key in datasets)
        if dataset_relative_ratio:
            ds, ratio = dataset_relative_ratio
            line_count = len(datasets[ds])
            # By definition ratio = u * line_count / sum(#lines of other datasets)
            u = (total_line_count - line_count) / line_count * ratio
            dataset_upsampling = {ds: u}
        elif not dataset_upsampling:
            dataset_upsampling = {}

        print(f"|dataset upsampling:{dataset_upsampling}")
        ds_list = []
        sample_ratios = []
        for key, val in datasets.items():
            ds_list.append(val)
            sample_ratios.append(int(dataset_upsampling.get(key, 1)))

        self.datasets[split] = LanguagePairUpsamplingDataset(
            datasets=ds_list, sample_ratios=sample_ratios)
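
Finally, a hedged sketch of how the multi-path helper might be called with two corpora and an explicit upsampling factor; the corpus keys, paths, and `task` object are illustrative assumptions:

    # Hypothetical usage sketch; keys, paths, and `task` are assumptions.
    task._load_dataset_multi_path_helper(
        split="train",
        src_multiple_bin_paths={
            "parallel": "data/parallel.src.npz",
            "backtranslated": "data/bt.src.npz",
        },
        tgt_multiple_bin_paths={
            "parallel": "data/parallel.tgt.npz",
            "backtranslated": "data/bt.tgt.npz",
        },
        dataset_upsampling={"parallel": 2.0},  # repeat the parallel corpus twice
    )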