Example #1
 def generator(self, data_dir, tmp_dir, train):
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.targeted_vocab_size,
         source_datasets,
         file_byte_budget=1e8)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.target_vocab_name,
         self.targeted_vocab_size,
         target_datasets,
         file_byte_budget=1e8)
     tag = "train" if train else "dev"
     filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                                data_path + ".lang2",
                                                source_vocab, target_vocab,
                                                EOS)
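
Here generator_utils.get_or_generate_vocab builds a subword vocabulary from the listed corpora the first time it runs and reloads it from data_dir afterwards; file_byte_budget caps how many bytes of each corpus file are read while fitting the vocabulary. A minimal sketch of the returned encoder in use (the paths, dataset spec, and vocab filename below are assumptions for illustration):

 from tensor2tensor.data_generators import generator_utils

 data_dir, tmp_dir = "/tmp/t2t_data", "/tmp/t2t_tmp"  # assumed paths
 # Assumed dataset spec: [download_url, [filename_inside_archive]]
 source_datasets = [["http://example.com/corpus.tgz", ["corpus.en"]]]
 source_vocab = generator_utils.get_or_generate_vocab(
     data_dir, tmp_dir, "vocab.example.8192", 2**13,
     source_datasets, file_byte_budget=1e8)
 ids = source_vocab.encode("Machine translation")  # list of subword ids
 text = source_vocab.decode(ids)                   # round-trips the string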
Example #2
 def generator(self, data_dir, tmp_dir, train):
     datasets = _TRAIN_DATASETS if train else _TEST_DATASETS
     source_datasets = [[FLAGS.raw_data_dir, [item[0]]]
                        for item in datasets]
     target_datasets = [[FLAGS.raw_data_dir, [item[1]]]
                        for item in datasets]
     # Copy vocab to data directory
     source_vocab_path = os.path.join(data_dir, self.source_vocab_name)
     target_vocab_path = os.path.join(data_dir, self.target_vocab_name)
     if os.path.exists(source_vocab_path):
         os.remove(source_vocab_path)
     if os.path.exists(target_vocab_path):
         os.remove(target_vocab_path)
     copyVocab(os.path.join(FLAGS.raw_data_dir, _VOCABS[0]),
               source_vocab_path)
     copyVocab(os.path.join(FLAGS.raw_data_dir, _VOCABS[1]),
               target_vocab_path)
     source_token_vocab = text_encoder.TokenTextEncoder(source_vocab_path,
                                                        replace_oov="<unk>")
     target_token_vocab = text_encoder.TokenTextEncoder(target_vocab_path,
                                                        replace_oov="<unk>")
     tag = "train" if train else "dev"
     data_path = _compile_data(tmp_dir, datasets, "generic_tok_%s" % tag)
     return translate.bi_vocabs_token_generator(data_path + ".src",
                                                data_path + ".trg",
                                                source_token_vocab,
                                                target_token_vocab, EOS)
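
copyVocab is a project-specific helper that is not shown on this page; a minimal stand-in, assuming it merely copies the pre-built vocabulary file into the data directory, might look like this (the real helper could also normalize the file or prepend reserved tokens):

 import shutil

 def copyVocab(src_path, dst_path):
     # Assumed behavior: copy the fixed vocabulary file as-is so that
     # text_encoder.TokenTextEncoder can load it from data_dir.
     shutil.copy(src_path, dst_path)

TokenTextEncoder then maps each whitespace-separated token to an id by its position in that file, routing out-of-vocabulary tokens to the <unk> entry via replace_oov.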
Example #3
    def generator(self, data_dir, tmp_dir, train):
        # Load the locally prepared vocabularies from the data directory.
        source_vocab = generator_utils.get_local_vocab(
            data_dir, self.source_vocab_name())
        target_vocab = generator_utils.get_local_vocab(
            data_dir, self.target_vocab_name())
        tag = "train" if train else "dev"

        filename_base = "%d.%s" % (self.targeted_vocab_size, tag)
        data_path = os.path.join(data_dir, filename_base)
        # bi_vocabs_token_generator assumes the two files have the same
        # number of lines and yields {"inputs": ..., "targets": ...}
        # dictionaries of token ids, appending EOS to each sequence.
        return translate.bi_vocabs_token_generator(data_path + ".mn.shuf",
                                                   data_path + ".ch.shuf",
                                                   source_vocab, target_vocab,
                                                   EOS)
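
All of these examples end in translate.bi_vocabs_token_generator, which assumes the source and target files have the same number of lines and yields dictionaries of "inputs" and "targets": the space-split lines of each file converted to token ids, with eos appended when given. A simplified stand-alone sketch of that behavior (the library version streams files via tf.gfile; this stand-in uses plain open):

 def bi_vocabs_token_generator(source_path, target_path,
                               source_token_vocab, target_token_vocab,
                               eos=None):
     # Simplified re-implementation for illustration only.
     eos_list = [] if eos is None else [eos]
     with open(source_path) as source_file, open(target_path) as target_file:
         for source_line, target_line in zip(source_file, target_file):
             yield {
                 "inputs": source_token_vocab.encode(source_line.strip()) + eos_list,
                 "targets": target_token_vocab.encode(target_line.strip()) + eos_list,
             }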
Example #4
    def generator(self, data_dir, tmp_dir, train):
        tag = "train" if train else "dev"
        source_path = "{}/{}/".format(self.root_dir, tag)

        vocab_file = "{}/{}".format(self.root_dir, self.vocabulary_file)
        symbolizer_vocab = text_encoder.TokenTextEncoder(vocab_file,
                                                         replace_oov='<UNK>')
        mesh_vocab_filename = os.path.join(self.root_dir,
                                           self.mesh_vocabulary_file)
        mesh_vocab = text_encoder.TokenTextEncoder(mesh_vocab_filename,
                                                   replace_oov='<UNK>')

        return translate.bi_vocabs_token_generator(
            source_path + self.data_file, source_path + self.tag_file,
            symbolizer_vocab, mesh_vocab, EOS)
Example #5
 def generator(self, data_dir, tmp_dir, train):
   datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
   target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets)
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enzh_tok_%s" % tag)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #6
 def generator(self, data_dir, tmp_dir, train):
   TRAIN_DATASET = self.get_training_dataset(tmp_dir)
   datasets = TRAIN_DATASET if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in TRAIN_DATASET]
   target_datasets = [[item[0], [item[1][1]]] for item in TRAIN_DATASET]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets, file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets, file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #7
    def generator(self, data_dir, tmp_dir, train):
        datasets = self.get_datasets(train)

        # build vocab from training datasets
        source_datasets = [[item[0], [item[1][0]]] for item in self.get_datasets(train=True)]
        target_datasets = [[item[0], [item[1][1]]] for item in self.get_datasets(train=True)]
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
            source_datasets)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
            target_datasets)

        tag = "train" if train else "dev"
        data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
        return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                         source_vocab, target_vocab, EOS)
Example #8

  def generator(self, data_dir, tmp_dir, train):
    datasets = _KOEN_TRAIN_DATASETS if train else _KOEN_TEST_DATASETS
    source_datasets = [item[0] for item in _KOEN_TRAIN_DATASETS]
    target_datasets = [item[1] for item in _KOEN_TRAIN_DATASETS]

    tf.gfile.MakeDirs(data_dir)

    source_vocab = get_or_generate_vocab(
        data_dir, tmp_dir, self.source_vocab_name, self.sourced_vocab_size,
        source_datasets, mode=self.source_mode)
    target_vocab = get_or_generate_vocab(
        data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
        target_datasets, mode=self.target_mode)

    tag = "train" if train else "dev"
    data_path = get_or_compile_data(tmp_dir, datasets, "simple_koen_tok_%s" % tag)

    return translate.bi_vocabs_token_generator(
        data_path + ".lang1", data_path + ".lang2",
        source_vocab, target_vocab, text_encoder.EOS_ID)
Example #9
 def generator(self, data_dir, tmp_dir, train):
     datasets = _TRAIN_DATASETS if train else _TEST_DATASETS
     source_datasets = [[FLAGS.raw_data_dir, [item[0]]]
                        for item in datasets]
     target_datasets = [[FLAGS.raw_data_dir, [item[1]]]
                        for item in datasets]
     source_vocab = generator_utils.get_or_generate_vocab_nocompress(
         data_dir, self.source_vocab_name, self.targeted_vocab_size,
         source_datasets)
     target_vocab = generator_utils.get_or_generate_vocab_nocompress(
         data_dir, self.target_vocab_name, self.targeted_vocab_size,
         target_datasets)
     tag = "train" if train else "dev"
     data_path = _compile_data(tmp_dir, datasets, "generic_tok_%s" % tag)
     return translate.bi_vocabs_token_generator(data_path + ".src",
                                                data_path + ".trg",
                                                source_vocab, target_vocab,
                                                EOS)
Example #10
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]]
                        for item in _ZHEN_TRAIN_DATASETS]
     target_datasets = [[item[0], [item[1][1]]]
                        for item in _ZHEN_TRAIN_DATASETS]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.source_vocab_name,
         self.targeted_vocab_size, source_datasets)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.target_vocab_name,
         self.targeted_vocab_size, target_datasets)
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "wmt_zhen_tok_%s" % tag)
     # We generate English->X data by convention, to train reverse translation
     # just add the "_rev" suffix to the problem name, e.g., like this.
     #   --problems=translate_enzh_wmt8k_rev
     return translate.bi_vocabs_token_generator(data_path + ".lang2",
                                                data_path + ".lang1",
                                                source_vocab, target_vocab,
                                                EOS)
Example #11

    def generator(self, data_dir, tmp_dir, train):
        datasets = _ENKR_SUBTITLE_TRAIN_DATASETS if train else _ENKR_SUBTITLE_TEST_DATASETS
        source_datasets = [
            os.path.join(data_dir, path[1][0]) for path in datasets
        ]
        target_datasets = [
            os.path.join(data_dir, path[1][1]) for path in datasets
        ]

        source_vocab = generator_utils.get_or_generate_txt_vocab(
            data_dir, self.source_vocab_name, self.targeted_vocab_size,
            source_datasets)
        target_vocab = generator_utils.get_or_generate_txt_vocab(
            data_dir, self.target_vocab_name, self.targeted_vocab_size,
            target_datasets)

        tag = "train" if train else "dev"
        data_path = compile_data_from_txt(tmp_dir, datasets,
                                          "zero_shot_enkrch_tok_%s" % tag)

        return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                                   data_path + ".lang2",
                                                   source_vocab, target_vocab,
                                                   EOS)
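
In all of these snippets the returned generator is not iterated by hand; it is handed to tensor2tensor's file writer, which serializes the yielded dictionaries into sharded TFRecord files. A hedged usage sketch (the problem class name and paths are assumptions):

 from tensor2tensor.data_generators import generator_utils

 data_dir, tmp_dir = "/tmp/t2t_data", "/tmp/t2t_tmp"  # assumed paths
 problem = TranslateEnzhWmt8k()  # hypothetical registered Problem subclass
 generator_utils.generate_files(
     problem.generator(data_dir, tmp_dir, train=True),
     problem.training_filepaths(data_dir, num_shards=100, shuffled=False))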