Example #1
  def testCompileData(self):
    filename = "out"
    filepath = os.path.join(self.tmp_dir, filename)
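    # compile_data gathers the raw parallel files listed in DATASETS into a
    # single pair of files: <filepath>.lang1 and <filepath>.lang2.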
    translate.compile_data(self.tmp_dir, self.DATASETS, filename)

    count = 0
    for i, example in enumerate(
        text_problems.text2text_txt_iterator(filepath + ".lang1",
                                             filepath + ".lang2")):
      expected = self.data[i]
      self.assertEqual(list(expected), [example["inputs"], example["targets"]])
      count += 1
    self.assertEqual(count, len(self.data))
Example #2
  def testCompileData(self):
    filename = "out"
    filepath = os.path.join(self.tmp_dir, filename)
    translate.compile_data(self.tmp_dir, self.DATASETS, filename)

    count = 0
    for i, example in enumerate(
        text_problems.text2text_txt_iterator(filepath + ".lang1",
                                             filepath + ".lang2")):
      expected = self.data[i]
      self.assertEqual(list(expected), [example["inputs"], example["targets"]])
      count += 1
    self.assertEqual(count, len(self.data))
Example #3
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = self.source_data_files(dataset_split)
     """
     for item in datasets:
         dummy_file_name = item[0].split('/')[-1]
         create_dummy_tar(tmp_dir, dummy_file_name)
         s_file, t_file = item[1][0], item[1][1]
         if not os.path.exists(os.path.join(tmp_dir, s_file)):
             raise Exception("Be sure file '%s' is exists in tmp dir" % s_file)
         if not os.path.exists(os.path.join(tmp_dir, t_file)):
             raise Exception("Be sure file '%s' is exists in tmp dir" % t_file)
     """
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
      target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
     target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
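      # The vocab files are expected to already exist in data_dir;
      # TokenTextEncoder only loads them and maps unknown tokens to oov_token.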
     source_encoder = text_encoder.TokenTextEncoder(
         source_vocab_filename, replace_oov=self.oov_token)
     target_encoder = text_encoder.TokenTextEncoder(
         target_vocab_filename, replace_oov=self.oov_token)
     tag = "train" if train else "dev"
     filename_base = "%s-compiled-%s" % (self.name, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + '.lang1',
                                              data_path + '.lang2'),
         source_encoder, target_encoder)
Example #4
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset  #if train else _NC_TEST_DATASETS
        source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
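        # Separate subword vocabularies are built (or loaded from data_dir if
        # already present) for the source side and the target side.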
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)
        tag = "train" if train else "dev"

        filename_base = "%s" % (tag)

        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".source",
                                                 data_path + ".target"),
            source_vocab, target_vocab)
Example #5
 def generator(self, data_dir, tmp_dir, train):
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.targeted_vocab_size,
         source_datasets,
         file_byte_budget=1e8)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.target_vocab_name,
         self.targeted_vocab_size,
         target_datasets,
         file_byte_budget=1e8)
     tag = "train" if train else "dev"
     filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
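      # bi_vocabs_token_generator encodes the source side with source_vocab and
      # the target side with target_vocab, appending EOS to every sequence.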
     return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                                data_path + ".lang2",
                                                source_vocab, target_vocab,
                                                EOS)
Example #6
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
   train = dataset_split == problem.DatasetSplit.TRAIN
   train_dataset = self.get_training_dataset(tmp_dir)
   datasets = train_dataset if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
   target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.source_vocab_name,
       self.approx_vocab_size,
       source_datasets,
       file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.target_vocab_name,
       self.approx_vocab_size,
       target_datasets,
       file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
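   # text2text_generate_encoded encodes each example with the two vocabularies
   # and appends EOS to both inputs and targets.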
   return text_problems.text2text_generate_encoded(
       text_problems.text2text_txt_iterator(data_path + ".lang1",
                                            data_path + ".lang2"),
       source_vocab, target_vocab)
Example #7
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = my_spm_utils.get_or_generate_spm(
         data_dir,
         tmp_dir,
         vocab_size=self.approx_vocab_size,
         model_prefix=self.source_vocab_name,
         sources=source_datasets,
         file_byte_budget=1e10)
     target_vocab = my_spm_utils.get_or_generate_spm(
         data_dir,
         tmp_dir,
         vocab_size=int(self.approx_vocab_size / 2),
         model_prefix=self.target_vocab_name,
         sources=target_datasets,
         file_byte_budget=1e10)
     tag = "train" if train else "dev"
     filename_base = "wmt_enja_%sk_tok_%s" % (self.approx_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         source_vocab, target_vocab)
Example #8
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     vocab_datasets = []
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "czeng57m_encs_tok_%s" % tag)
     # CzEng contains 100 gz files with tab-separated columns, so let's expect
     # it is the first dataset in datasets and use the newly created *.lang{1,2}
     # files for vocab construction.
     if datasets[0][0].endswith("czeng57m.tar"):
         vocab_datasets.append([
             datasets[0][0],
             [
                 "czeng57m_encs_tok_%s.lang1" % tag,
                 "czeng57m_encs_tok_%s.lang2" % tag
             ]
         ])
         datasets = datasets[1:]
     vocab_datasets += [[item[0], [item[1][0], item[1][1]]]
                        for item in datasets]
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         vocab_datasets)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, text_encoder.EOS_ID)
Example #9
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        datasets = self.source_data_files(dataset_split)
        tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"

        # create shared vocabulary
        if self.vocab_type == "subwords":
            data_path = translate.compile_data(
                tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
            self.get_or_create_vocab(data_dir, tmp_dir)
            sample_iterator = text_problems.text2text_txt_iterator(
                data_path + ".lang1", data_path + ".lang2")
        elif self.vocab_type == "tokens":
            sample_iterator = super().generate_samples(data_dir, tmp_dir,
                                                       dataset_split)
        else:
            raise ValueError("VocabType not supported")

        # create source feature vocabularies
        data_path = self.compile_sfeat_data(
            tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
        self.create_src_feature_vocabs(data_dir, tmp_dir)
        sfeat_iterator = text_problems.txt_line_iterator(data_path + ".sfeat")

        def _generate(sample_iterator, sfeat_iterator):
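            # Attach the corresponding source-feature line to each parallel example.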
            for sample in sample_iterator:
                sample["sfeats"] = next(sfeat_iterator)
                yield sample

        return _generate(sample_iterator, sfeat_iterator)
Example #10
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   train = dataset_split == problem.DatasetSplit.TRAIN
   datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_encs_chr_%s" % tag)
   return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                               data_path + ".lang2")
Example #11
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "wmt_encs_chr_%s" % tag)
     return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2")
Example #12
 def generator(self, _, tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
   datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_ende_chr_%s" % tag)
   return translate.character_generator(
       data_path + ".lang1", data_path + ".lang2", character_vocab, EOS)
Example #13
 def generator(self, _, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "wmt_ende_chr_%s" % tag)
     return translate.character_generator(data_path + ".lang1",
                                          data_path + ".lang2",
                                          character_vocab, EOS)
Example #14
 def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       _ENDE_TRAIN_DATASETS)
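   # Note: the vocabulary above is built from the full training datasets, even
   # when generating the dev split.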
   datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_ende_tok_%s" % tag)
   return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                    symbolizer_vocab, EOS)
Example #15
 def generator(self, data_dir, tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
   if self.use_small_dataset:
     datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
   else:
     datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enfr_chr_%s" % tag)
   return translate.character_generator(
       data_path + ".lang1", data_path + ".lang2", character_vocab, EOS)
Example #16
 def generator(self, data_dir, tmp_dir, train):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         _ENDE_TRAIN_DATASETS)
     datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "wmt_ende_tok_%s" % tag)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, EOS)
Example #17
 def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       _ENFR_TRAIN_SMALL_DATA)
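   # Note: the vocabulary above is always built from _ENFR_TRAIN_SMALL_DATA,
   # even when use_small_dataset is False.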
   if self.use_small_dataset:
     datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
   else:
     datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enfr_tok_%s" % tag)
   return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                    symbolizer_vocab, EOS)
Example #18
 def generator(self, data_dir, tmp_dir, train):
     datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in datasets]
     target_datasets = [[item[0], [item[1][1]]] for item in datasets]
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         source_datasets + target_datasets)
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "setimes_mken_tok_%s" % tag)
     # We generate English->X data by convention, to train reverse translation
     # just add the "_rev" suffix to the problem name, e.g., like this.
     #   --problems=translate_enmk_setimes32k_rev
     return translate.token_generator(data_path + ".lang2",
                                      data_path + ".lang1",
                                      symbolizer_vocab, EOS)
Example #19
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset if train else _NC_TEST_DATASETS

        source_vocab = text_encoder.TokenTextEncoder(os.path.join(data_dir, self.source_vocab_name),
                                                     replace_oov=self.oov_token)
        target_vocab = text_encoder.TokenTextEncoder(os.path.join(data_dir, self.target_vocab_name),
                                                     replace_oov=self.oov_token)
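        # Both vocab files must already exist in data_dir; TokenTextEncoder
        # only loads them, mapping out-of-vocabulary tokens to oov_token.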
        tag = "train" if train else "dev"
        filename_base = "wmt_pdre_tok_%s" % tag
        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Example #20
 def generator(self, data_dir, tmp_dir, train):
   datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
   target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets)
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enzh_tok_%s" % tag)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #21
 def generator(self, data_dir, tmp_dir, train):
   datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
   target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets)
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enzh_tok_%s" % tag)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #22
    def generator(self, data_dir, tmp_dir, train):
        datasets = self.get_datasets(train)

        # build vocab from training datasets
        source_datasets = [[item[0], [item[1][0]]] for item in self.get_datasets(train=True)]
        target_datasets = [[item[0], [item[1][1]]] for item in self.get_datasets(train=True)]
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
            source_datasets)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
            target_datasets)

        tag = "train" if train else "dev"
        data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
        return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                            source_vocab, target_vocab, EOS)
Example #23
 def generator(self, data_dir, tmp_dir, train):
   TRAIN_DATASET = self.get_training_dataset(tmp_dir)
   datasets = TRAIN_DATASET if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in TRAIN_DATASET]
   target_datasets = [[item[0], [item[1][1]]] for item in TRAIN_DATASET]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets, file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets, file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #24
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.approx_vocab_size,
         _ZHZH_TRAIN_DATASETS,
         file_byte_budget=1e8)
     train = dataset_split == problem.DatasetSplit.TRAIN
     datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "mydata_enzh_tok_%s" % tag)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         symbolizer_vocab, symbolizer_vocab)
Example #25
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset if train else _NC_TEST_DATASETS
        for item in datasets:
            dummy_file_name = item[0].split("/")[-1]
            create_dummy_tar(tmp_dir, dummy_file_name)
            s_file, t_file = item[1][0], item[1][1]
            if not os.path.exists(os.path.join(tmp_dir, s_file)):
                raise Exception("Be sure file '%s' is exists in tmp dir" %
                                s_file)
            if not os.path.exists(os.path.join(tmp_dir, t_file)):
                raise Exception("Be sure file '%s' is exists in tmp dir" %
                                t_file)

        source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]

        # If the vocab file already exists, it is used for encoding directly;
        # otherwise a vocabulary is built automatically. What is constructed
        # here are the encoders, which can also create the vocab files.
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)

        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)

        tag = "train" if train else "dev"
        filename_base = "wmt_enzh_%sk_sub_%s" % (self.approx_vocab_size, tag)
        # Concatenate all the corpora into a single compiled file.
        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Example #26
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   datasets = self.source_data_files(dataset_split)
   tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
   data_path = translate.compile_data(
       tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
   # For eval, use authentic data.
   if dataset_split != problem.DatasetSplit.TRAIN:
     for example in text_problems.text2text_txt_iterator(
         data_path + ".lang1", data_path + ".lang2"):
       yield example
   else:  # For training, mix synthetic and authentic data as follows.
     for (file1, file2) in self.backtranslate_data_filenames:
       path1 = os.path.join(tmp_dir, file1)
       path2 = os.path.join(tmp_dir, file2)
       # Synthetic data first.
       for example in text_problems.text2text_txt_iterator(path1, path2):
         yield example
       # Now authentic data.
       for example in text_problems.text2text_txt_iterator(
           data_path + ".lang1", data_path + ".lang2"):
         yield example
Example #27
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]]
                        for item in _ZHEN_TRAIN_DATASETS]
     target_datasets = [[item[0], [item[1][1]]]
                        for item in _ZHEN_TRAIN_DATASETS]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.source_vocab_name,
         self.targeted_vocab_size, source_datasets)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.target_vocab_name,
         self.targeted_vocab_size, target_datasets)
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "wmt_zhen_tok_%s" % tag)
     # We generate English->X data by convention, to train reverse translation
     # just add the "_rev" suffix to the problem name, e.g., like this.
     #   --problems=translate_enzh_wmt8k_rev
     return translate.bi_vocabs_token_generator(data_path + ".lang2",
                                                data_path + ".lang1",
                                                source_vocab, target_vocab,
                                                EOS)
Example #28
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   datasets = self.source_data_files(dataset_split)
   tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
   data_path = translate.compile_data(
       tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
   # Iterator over authentic data.
   it_auth = text_problems.text2text_txt_iterator(
       data_path + ".lang1", data_path + ".lang2")
   # For eval, use authentic data.
   if dataset_split != problem.DatasetSplit.TRAIN:
     for example in it_auth:
       yield example
   else:  # For training, mix synthetic and authentic data as follows.
     for (file1, file2) in self.backtranslate_data_filenames:
       path1 = os.path.join(tmp_dir, file1)
       path2 = os.path.join(tmp_dir, file2)
       # Synthetic data first.
       for example in text_problems.text2text_txt_iterator(path1, path2):
         yield example
       # Now authentic data.
       for example in it_auth:
         yield example