Example #1
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):

        train = dataset_split == problem.DatasetSplit.TRAIN

        source_datasets = _INPUT_FILES
        target_datasets = _OUTPUT_FILES

        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)

        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)

        tag = "train" if train else "test"

        filename_src = "en_{}.src".format(tag)
        filename_dst = "ru_{}.dst".format(tag)

        data_path = './shad_nlp18_contextNMT/data_fused/'

        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + filename_src,
                                                 data_path + filename_dst),
            source_vocab, target_vocab)
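
The method above assumes it lives inside a tensor2tensor Problem subclass that defines source_vocab_name, target_vocab_name, approx_vocab_size, and the _INPUT_FILES/_OUTPUT_FILES dataset lists. A minimal sketch of that assumed context follows; the class name, vocab filenames, and dataset URLs are hypothetical placeholders, not the original author's values.

# Minimal sketch (hypothetical names/URLs) of the class context assumed by Example 1.
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry

# Placeholder dataset lists; the real _INPUT_FILES/_OUTPUT_FILES are not shown above.
_INPUT_FILES = [["http://example.com/enru.tar.gz", ["en_train.src"]]]
_OUTPUT_FILES = [["http://example.com/enru.tar.gz", ["ru_train.dst"]]]


@registry.register_problem
class TranslateEnruContext(text_problems.Text2TextProblem):
  """Owns the vocab names and size used by generate_encoded_samples above."""

  @property
  def approx_vocab_size(self):
    return 2**15  # roughly 32k subwords

  @property
  def source_vocab_name(self):
    return "vocab.enru-en.%d" % self.approx_vocab_size

  @property
  def target_vocab_name(self):
    return "vocab.enru-ru.%d" % self.approx_vocab_size

A real two-vocabulary problem would normally also override feature_encoders() so both vocab files are loaded at decoding time; that part is omitted from this sketch.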
Example #2
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
   train = dataset_split == problem.DatasetSplit.TRAIN
   train_dataset = self.get_training_dataset(tmp_dir)
   datasets = train_dataset if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
   target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.source_vocab_name,
       self.approx_vocab_size,
       source_datasets,
       file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir,
       tmp_dir,
       self.target_vocab_name,
       self.approx_vocab_size,
       target_datasets,
       file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
   return text_problems.text2text_generate_encoded(
       text_problems.text2text_txt_iterator(data_path + ".lang1",
                                            data_path + ".lang2"),
       source_vocab, target_vocab)
Example #3
def parsing_token_generator(data_dir, tmp_dir, train, source_vocab_size,
                            target_vocab_size):
  """Generator for parsing as a sequence-to-sequence task that uses tokens.

  This generator assumes the files parsing_{train,dev}.trees, which contain
  trees in wsj format.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    train: whether we're training or not.
    source_vocab_size: source vocab size.
    target_vocab_size: target vocab size.

  Returns:
    A generator to a dictionary of inputs and outputs.
  """
  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "wsj_source.vocab.%d" % source_vocab_size,
      source_vocab_size)
  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "wsj_target.vocab.%d" % target_vocab_size,
      target_vocab_size)
  filename = "%s_%s.trees" % (FLAGS.parsing_path, "train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return token_generator(tree_filepath, source_symbolizer_vocab,
                         target_symbolizer_vocab, 1)
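
Example 3 returns a token_generator that is not shown here. The sketch below is only a rough stand-in for what such a parsing generator is assumed to do: read one WSJ tree per line and yield encoded inputs/targets. The word/tree splitting is a crude placeholder, not the real parsing logic.

import tensorflow as tf


def token_generator(tree_filepath, source_vocab, target_vocab, eos=None):
  """Sketch: yield {"inputs", "targets"} token-id dicts, one per tree line."""
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(tree_filepath, mode="r") as trees:
    for tree_line in trees:
      tree_line = tree_line.strip()
      # Crude placeholder split: source = tokens without brackets,
      # target = the full linearized tree.
      words = " ".join(t for t in tree_line.split() if t not in ("(", ")"))
      yield {
          "inputs": source_vocab.encode(words) + eos_list,
          "targets": target_vocab.encode(tree_line) + eos_list,
      }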
Example #4
def parsing_token_generator(tmp_dir, train, source_vocab_size,
                            target_vocab_size):
    """Generator for parsing as a sequence-to-sequence task that uses tokens.

  This generator assumes the files parsing_{train,dev}.trees, which contain
  trees in wsj format.

  Args:
    tmp_dir: path to temporary storage directory.
    train: whether we're training or not.
    source_vocab_size: source vocab size.
    target_vocab_size: target vocab size.

  Returns:
    A generator to a dictionary of inputs and outputs.
  """
    source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
        tmp_dir, "wsj_source.tokens.vocab.%d" % source_vocab_size,
        source_vocab_size)
    target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
        tmp_dir, "wsj_target.tokens.vocab.%d" % target_vocab_size,
        target_vocab_size)
    filename = "%s_%s.trees" % (FLAGS.parsing_path,
                                "train" if train else "dev")
    tree_filepath = os.path.join(tmp_dir, filename)
    return token_generator(tree_filepath, source_symbolizer_vocab,
                           target_symbolizer_vocab, 1)
Example #5
    def generator(self, data_dir, tmp_dir, train):
        #pdb.set_trace()

        source_vocab_size = self.targeted_vocab_size
        target_vocab_size = self.targeted_vocab_size

        #symbolizer_vocab = generator_utils.get_or_generate_vocab(data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size)

        source_datasets = [[
            "pronoun_enfr_train.lang1", ["pronoun_enfr_train.lang1"]
        ]]
        target_datasets = [[
            "pronoun_enfr_train.lang2", ["pronoun_enfr_train.lang2"]
        ]]

        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, "vocab.pronoun-en.%d" % source_vocab_size,
            source_vocab_size, source_datasets)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, "vocab.pronoun-fr.%d" % target_vocab_size,
            target_vocab_size, target_datasets)

        #datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
        #tag = "train" if train else "dev"
        #data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
        data_path = tmp_dir + "/pronoun_enfr_"
        if train:
            data_path = data_path + "train"
        else:
            data_path = data_path + "dev"

        return bi_vocabs_token_generator(data_path + ".lang2",
                                         data_path + ".lang1", source_vocab,
                                         target_vocab, EOS)
Example #6
def parsing_token_generator(tmp_dir, train, source_vocab_size,
                            target_vocab_size):
  """Generator for parsing as a sequence-to-sequence task that uses tokens.

  This generator assumes the files parsing_{train,dev}.wsj, which contain trees
  in wsj format and wsj_{source,target}.tokens.vocab.<vocab_size> exist in
  tmp_dir.

  Args:
    tmp_dir: path to temporary storage directory.
    train: whether we're training or not.
    source_vocab_size: source vocab size.
    target_vocab_size: target vocab size.

  Returns:
    A generator to a dictionary of inputs and outputs.
  """
  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "wsj_source.tokens.vocab.%d" % source_vocab_size,
      source_vocab_size)
  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "wsj_target.tokens.vocab.%d" % target_vocab_size,
      target_vocab_size)
  filename = "parsing_%s.trees" % ("train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return token_generator(tree_filepath, source_symbolizer_vocab,
                         target_symbolizer_vocab, 1)
Example #7
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.approx_vocab_size,
         source_datasets,
         file_byte_budget=1e8,
         max_subtoken_length=self.max_subtoken_length)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.target_vocab_name,
         self.approx_vocab_size,
         target_datasets,
         file_byte_budget=1e8,
         max_subtoken_length=self.max_subtoken_length)
     tag = "train" if train else "dev"
     filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         source_vocab, target_vocab)
Example #8
 def generator(self, data_dir, tmp_dir, train):
     train_dataset = self.get_training_dataset(tmp_dir)
     datasets = train_dataset if train else _NC_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
     target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.targeted_vocab_size,
         source_datasets,
         file_byte_budget=1e8)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.target_vocab_name,
         self.targeted_vocab_size,
         target_datasets,
         file_byte_budget=1e8)
     tag = "train" if train else "dev"
     filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
     return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                                data_path + ".lang2",
                                                source_vocab, target_vocab,
                                                EOS)
Example #9
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    datasets = self.source_data_files(dataset_split)
    tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
    data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name,
                                                                    tag))

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                data_path + ".lang2")
Example #11
def parsing_token_generator(tmp_dir, train, vocab_size):
    symbolizer_vocab = generator_utils.get_or_generate_vocab(
        tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
    filename = "parsing_%s" % ("train" if train else "dev")
    text_filepath = os.path.join(tmp_dir, filename + ".text")
    tags_filepath = os.path.join(tmp_dir, filename + ".tags")
    return token_generator(text_filepath, tags_filepath, symbolizer_vocab, EOS)
Example #12
def parsing_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  filename = "parsing_%s" % ("train" if train else "dev")
  text_filepath = os.path.join(tmp_dir, filename + ".text")
  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
  return token_generator(text_filepath, tags_filepath, symbolizer_vocab, 1)
Example #13
def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size)
  filename = "%s_%s.trees" % (FLAGS.parsing_path, "train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab,
                                     symbolizer_vocab, EOS)
Example #14
 def generator(self, data_dir, tmp_dir, train):
   source_vocab_size = self.targeted_vocab_size
   target_vocab_size = self.targeted_vocab_size
   datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
   target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, "vocab.zhen-zh.%d" % source_vocab_size,
       source_vocab_size, source_datasets)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, "vocab.zhen-en.%d" % target_vocab_size,
       target_vocab_size, target_datasets)
   tag = "train" if train else "dev"
   data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
   return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                    source_vocab, target_vocab, EOS)
Example #15
 def generator(self, data_dir, tmp_dir, train):
     vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, [_TRAIN_DATASETS["sv"]])
     datasets = _TRAIN_DATASETS["sv"] if train else _TEST_DATASETS["sv"]
     fulltext_file, summaries_file = download_and_extract_data(
         tmp_dir, datasets)
     return token_generator(fulltext_file, summaries_file, vocab, EOS)
Example #16
def zhen_wordpiece_token_generator(tmp_dir, train, source_vocab_size,
                                   target_vocab_size):
  """Wordpiece generator for the WMT'17 zh-en dataset."""
  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  source_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.zh.%d" % source_vocab_size, source_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.en.%d" % target_vocab_size, target_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
Example #17
 def generator(self, data_dir, tmp_dir, train):
   datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
   target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets)
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enzh_tok_%s" % tag)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #19
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     vocab_datasets = []
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "czeng57m_encs_tok_%s" % tag)
      # CzEng contains 100 gz files with tab-separated columns, so we expect it
      # to be the first dataset in datasets and use the newly created *.lang{1,2}
      # files for vocab construction.
     if datasets[0][0].endswith("czeng57m.tar"):
         vocab_datasets.append([
             datasets[0][0],
             [
                 "czeng57m_encs_tok_%s.lang1" % tag,
                 "czeng57m_encs_tok_%s.lang2" % tag
             ]
         ])
         datasets = datasets[1:]
     vocab_datasets += [[item[0], [item[1][0], item[1][1]]]
                        for item in datasets]
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         vocab_datasets)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, text_encoder.EOS_ID)
Example #20
def timit_generator(data_dir,
                    tmp_dir,
                    training,
                    how_many,
                    start_from=0,
                    eos_list=None,
                    vocab_filename=None,
                    vocab_size=0):
  """Data generator for TIMIT transcription problem.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    training: a Boolean; if true, we use the train set, otherwise the test set.
    how_many: how many inputs and labels to generate.
    start_from: from which input to start.
    eos_list: optional list of end of sentence tokens, otherwise use default
      value `1`.
    vocab_filename: file within `tmp_dir` to read vocabulary from. If this is
      not provided then the target sentence will be encoded by character.
    vocab_size: integer target to generate vocabulary size to.

  Yields:
    A dictionary representing the images with the following fields:
    * inputs: a float sequence containing the audio data
    * audio/channel_count: an integer
    * audio/sample_count: an integer
    * audio/sample_width: an integer
    * targets: an integer sequence representing the encoded sentence
  """
  eos_list = [1] if eos_list is None else eos_list
  if vocab_filename is not None:
    vocab_symbolizer = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, vocab_filename, vocab_size)
  _get_timit(tmp_dir)
  datasets = (_TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS)
  i = 0
  for data_dir, (audio_ext, transcription_ext) in datasets:
    data_dir = os.path.join(tmp_dir, data_dir)
    data_files = _collect_data(data_dir, audio_ext, transcription_ext)
    data_pairs = data_files.values()
    for input_file, target_file in sorted(data_pairs)[start_from:]:
      if i == how_many:
        return
      i += 1
      audio_data, sample_count, sample_width, num_channels = _get_audio_data(
          input_file)
      text_data = _get_text_data(target_file)
      if vocab_filename is None:
        label = [ord(c) for c in text_data] + eos_list
      else:
        label = vocab_symbolizer.encode(text_data) + eos_list
      yield {
          "inputs": audio_data,
          "audio/channel_count": [num_channels],
          "audio/sample_count": [sample_count],
          "audio/sample_width": [sample_width],
          "targets": label
      }
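
A quick consumption sketch for timit_generator, assuming the TIMIT archives already sit in tmp_dir; the directories and vocab filename below are hypothetical:

examples = timit_generator(
    data_dir="/tmp/t2t_data",          # hypothetical
    tmp_dir="/tmp/t2t_tmp",            # hypothetical
    training=True,
    how_many=2,
    vocab_filename="vocab.timit.256",  # hypothetical
    vocab_size=256)
for example in examples:
  # Each example carries raw audio samples plus the encoded transcription.
  print(len(example["inputs"]),
        example["audio/sample_count"],
        example["targets"][:10])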
Example #21
def timit_generator(data_dir,
                    tmp_dir,
                    training,
                    how_many,
                    start_from=0,
                    eos_list=None,
                    vocab_filename=None,
                    vocab_size=0):
    """Data generator for TIMIT transcription problem.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    training: a Boolean; if true, we use the train set, otherwise the test set.
    how_many: how many inputs and labels to generate.
    start_from: from which input to start.
    eos_list: optional list of end of sentence tokens, otherwise use default
      value `1`.
    vocab_filename: file within `tmp_dir` to read vocabulary from. If this is
      not provided then the target sentence will be encoded by character.
    vocab_size: integer target to generate vocabulary size to.

  Yields:
    A dictionary representing the images with the following fields:
    * inputs: a float sequence containing the audio data
    * audio/channel_count: an integer
    * audio/sample_count: an integer
    * audio/sample_width: an integer
    * targets: an integer sequence representing the encoded sentence
  """
    eos_list = [1] if eos_list is None else eos_list
    if vocab_filename is not None:
        vocab_symbolizer = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, vocab_filename, vocab_size)
    _get_timit(tmp_dir)
    datasets = (_TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS)
    i = 0
    for data_dir, (audio_ext, transcription_ext) in datasets:
        data_dir = os.path.join(tmp_dir, data_dir)
        data_files = _collect_data(data_dir, audio_ext, transcription_ext)
        data_pairs = data_files.values()
        for input_file, target_file in sorted(data_pairs)[start_from:]:
            if i == how_many:
                return
            i += 1
            audio_data, sample_count, sample_width, num_channels = _get_audio_data(
                input_file)
            text_data = _get_text_data(target_file)
            if vocab_filename is None:
                label = [ord(c) for c in text_data] + eos_list
            else:
                label = vocab_symbolizer.encode(text_data) + eos_list
            yield {
                "inputs": audio_data,
                "audio/channel_count": [num_channels],
                "audio/sample_count": [sample_count],
                "audio/sample_width": [sample_width],
                "targets": label
            }
Example #22
 def generator(self, data_dir, tmp_dir, train):
   TRAIN_DATASET = self.get_training_dataset(tmp_dir)
   datasets = TRAIN_DATASET if train else _NC_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in TRAIN_DATASET]
   target_datasets = [[item[0], [item[1][1]]] for item in TRAIN_DATASET]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets, file_byte_budget=1e8)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets, file_byte_budget=1e8)
   tag = "train" if train else "dev"
   filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
   data_path = translate.compile_data(tmp_dir, datasets, filename_base)
   return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                              data_path + ".lang2",
                                              source_vocab, target_vocab, EOS)
Example #23
def ende_wordpiece_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
Example #24
 def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size)
   datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
   return token_generator(data_path + ".lang1", data_path + ".lang2",
                          symbolizer_vocab, EOS)
Example #25
def ende_wordpiece_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, 1)
Example #26
    def generator(self, data_dir, tmp_dir, train):
        datasets = self.get_datasets(train)

        # build vocab from training datasets
        source_datasets = [[item[0], [item[1][0]]] for item in self.get_datasets(train=True)]
        target_datasets = [[item[0], [item[1][1]]] for item in self.get_datasets(train=True)]
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
            source_datasets)
        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
            target_datasets)

        tag = "train" if train else "dev"
        data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
        return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                            source_vocab, target_vocab, EOS)
Example #27
 def generate_vocab(self, data_dir, tmp_dir, **kwargs):
     datasets = get_dataset(tmp_dir)
     source_datasets = [[item[0], [item[1][0]]] for item in datasets]
     target_datasets = [[item[0], [item[1][1]]] for item in datasets]
     _ = generator_utils.get_or_generate_vocab(data_dir,
                                               tmp_dir,
                                               self.source_vocab_name,
                                               self.approx_vocab_size,
                                               source_datasets,
                                               file_byte_budget=1e8)
     _ = generator_utils.get_or_generate_vocab(data_dir,
                                               tmp_dir,
                                               self.target_vocab_name,
                                               int(self.approx_vocab_size /
                                                   2),
                                               target_datasets,
                                               file_byte_budget=1e8)
Example #28
 def generator(self, data_dir, tmp_dir, train):
   datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
   target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
   source_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
       source_datasets)
   target_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
       target_datasets)
   tag = "train" if train else "dev"
   data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
   # We generate English->X data by convention, to train reverse translation
   # just add the "_rev" suffix to the problem name, e.g., like this.
   #   --problems=translate_enzh_wmt8k_rev
   return bi_vocabs_token_generator(data_path + ".lang2", data_path + ".lang1",
                                    source_vocab, target_vocab, EOS)
Example #29
def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
  """Instance of token generator for the WMT en->fr task."""
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
Example #30
def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
  """Instance of token generator for the WMT en->fr task."""
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, 1)
Example #31
 def generator(self, data_dir, tmp_dir, train):
     vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         [_TRAIN_DATASETS["facts-result"]])
     datasets = _TRAIN_DATASETS[
         "facts-result"] if train else _TEST_DATASETS["facts-result"]
     document_file, labels_file = download_and_extract_data(
         tmp_dir, datasets)
     return token_generator(document_file, labels_file, vocab, EOS)
Example #32
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        train = dataset_split == problem.DatasetSplit.TRAIN
        train_dataset = self.get_training_dataset(tmp_dir)
        datasets = train_dataset if train else _NC_TEST_DATASETS
        for item in datasets:
            dummy_file_name = item[0].split("/")[-1]
            create_dummy_tar(tmp_dir, dummy_file_name)
            s_file, t_file = item[1][0], item[1][1]
            if not os.path.exists(os.path.join(tmp_dir, s_file)):
                raise Exception("Make sure file '%s' exists in tmp_dir" %
                                s_file)
            if not os.path.exists(os.path.join(tmp_dir, t_file)):
                raise Exception("Make sure file '%s' exists in tmp_dir" %
                                t_file)

        source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]

        # If a vocab file already exists it is used for encoding directly; otherwise a
        # vocab is built here. This constructs the encoders and can create the vocabs
        # at the same time.
        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            source_datasets,
            file_byte_budget=1e8)

        target_vocab = generator_utils.get_or_generate_vocab(
            data_dir,
            tmp_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            target_datasets,
            file_byte_budget=1e8)

        tag = "train" if train else "dev"
        filename_base = "wmt_enzh_%sk_sub_%s" % (self.approx_vocab_size, tag)
        # Concatenate all the corpora into a single file.
        data_path = translate.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
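
Example 32 calls a create_dummy_tar helper that is not shown. A plausible minimal version is sketched below, assuming its only job is to drop an empty archive with the expected name so the download step treats the corpus as already present; this is a guess, not the author's implementation.

import os
import tarfile


def create_dummy_tar(tmp_dir, dummy_file_name):
  # Write an empty tar under the expected corpus name (assumed behavior), so the
  # parallel files placed directly in tmp_dir are used instead of a download.
  dummy_path = os.path.join(tmp_dir, dummy_file_name)
  if not os.path.exists(dummy_path):
    with tarfile.open(dummy_path, "w"):
      pass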
Example #33
 def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       _ENDE_TRAIN_DATASETS)
   datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_ende_tok_%s" % tag)
   return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                    symbolizer_vocab, EOS)
Example #34
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file)
Example #36
 def generator(self, data_dir, tmp_dir, train):
   datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in datasets]
   target_datasets = [[item[0], [item[1][1]]] for item in datasets]
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       source_datasets + target_datasets)
   tag = "train" if train else "dev"
   data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag)
   return token_generator(data_path + ".lang1", data_path + ".lang2",
                          symbolizer_vocab, EOS)
Example #37
 def generator(self, data_dir, tmp_dir, train):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         _TRAIN_DATASETS["en-sv"])
     datasets = _TRAIN_DATASETS["en-sv"] if train else _TEST_DATASETS[
         "en-sv"]
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "legal_ensv_tok_%s" % tag)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, EOS)
Example #38
def mken_wordpiece_token_generator(tmp_dir, train, vocab_size):
  """Wordpiece generator for the SETimes Mk-En dataset."""
  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size,
      source_datasets + target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
Example #39
 def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       _ENFR_TRAIN_SMALL_DATA)
   if self.use_small_dataset:
     datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
   else:
     datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enfr_tok_%s" % tag)
   return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                    symbolizer_vocab, EOS)
Example #40
 def generator(self, data_dir, tmp_dir, train):
   datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in datasets]
   target_datasets = [[item[0], [item[1][1]]] for item in datasets]
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       source_datasets + target_datasets)
   tag = "train" if train else "dev"
   data_path = translate._compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag)
   # We generate English->X data by convention, to train reverse translation
   # just add the "_rev" suffix to the problem name, e.g., like this.
   #   --problems=translate_enmk_setimes32k_rev
   return translate.token_generator(data_path + ".lang2", data_path + ".lang1",
                          symbolizer_vocab, EOS)
Example #41
 def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir,
         tmp_dir,
         self.source_vocab_name,
         self.approx_vocab_size,
         _ZHZH_TRAIN_DATASETS,
         file_byte_budget=1e8)
     train = dataset_split == problem.DatasetSplit.TRAIN
     datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "mydata_enzh_tok_%s" % tag)
     return text_problems.text2text_generate_encoded(
         text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2"),
         symbolizer_vocab, symbolizer_vocab)
Example #42
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag)
     vocab_datasets = []
      # CzEng contains 100 gz files with tab-separated columns, so we expect it
      # to be the first dataset in datasets and use the newly created
      # *.lang{1,2} files instead.
     if datasets[0][0].endswith("data-plaintext-format.tar"):
         vocab_datasets.append([
             datasets[0][0],
             ["wmt_encs_tok_%s.lang1" % tag,
              "wmt_encs_tok_%s.lang2" % tag]
         ])
         datasets = datasets[1:]
     vocab_datasets += [[item[0], [item[1][0], item[1][1]]]
                        for item in datasets]
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         vocab_datasets)
     return token_generator(data_path + ".lang1", data_path + ".lang2",
                            symbolizer_vocab, EOS)
Example #43
  def generator(self, data_dir, tmp_dir, train):
    # Called twice: for train and test

    # Get the list of the training samples (coding challenge samples)
    samples = list(generator_samples(tmp_dir, self.pb_constants))

    # Split between train and dev
    # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
    # difficulties in each set.
    # Need to sort the samples first before shuffling (as walk() isn't
    # deterministic)
    samples.sort(key=lambda x: x.desc_file)  # in-place
    rng = random.Random(7531)  # Local fixed seed
    rng.shuffle(samples)  # in-place

    # Train: 5019/5228 problems
    # Dev: 209/5228 problems
    len_samples = len(samples)
    split = len_samples // 25
    samples = samples[split:] if train else samples[:split]
    tf.logging.info("Number of samples for {}: {}/{}".format(
        "train" if train else "dev",
        len(samples),
        len_samples
    ))

    def generator_samples_content(get_source, get_target):
      source, target = None, None
      # Iterate over the coding samples
      for sample in samples:
        if get_source:
          with tf.gfile.GFile(sample.desc_file, mode="r") as source_file:
            source = source_file.read()

        if get_target:
          # Each challenge can have multiple implementations (or none)
          for code_file in sample.code_files:
            with tf.gfile.GFile(code_file, mode="r") as target_file:
              target = target_file.read()
              target = self.preprocess_target(target)
            yield source, target
        elif sample.code_files:  # Only take the source if a target exists
          yield source, target

    def generator_target():
      for _, target in generator_samples_content(False, True):
        yield target.strip()

    # Generate vocab for both source and target

    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_input_filename, self.input_vocab_size)

    target_vocab = generator_utils.get_or_generate_vocab_inner(
        data_dir=data_dir,
        vocab_filename=self.vocab_target_filename,
        vocab_size=self.target_vocab_size,
        generator=generator_target(),)

    # Yield the training and testing samples
    eos_list = [EOS]
    for source, target in generator_samples_content(True, True):
      source_ints = source_vocab.encode(source.strip()) + eos_list
      target_ints = target_vocab.encode(target.strip()) + eos_list
      yield {
          "inputs": source_ints,
          "targets": target_ints,
      }
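
Example 43 builds the target vocabulary from an in-memory generator via get_or_generate_vocab_inner instead of from corpus files. A small usage sketch with the same keyword arguments is shown below; the directory, filename, size, and toy generator are placeholders:

def tiny_target_generator():
  # Stand-in for generator_target() above; any iterable of raw text works.
  for line in ["print('hello world')", "return x + 1"]:
    yield line


target_vocab = generator_utils.get_or_generate_vocab_inner(
    data_dir="/tmp/t2t_data",            # hypothetical
    vocab_filename="vocab.target.8192",  # hypothetical
    vocab_size=8192,
    generator=tiny_target_generator())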
Example #44
def mscoco_generator(data_dir,
                     tmp_dir,
                     training,
                     how_many,
                     start_from=0,
                     eos_list=None,
                     vocab_filename=None,
                     vocab_size=0):
  """Image generator for MSCOCO captioning problem with token-wise captions.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    training: a Boolean; if true, we use the train set, otherwise the test set.
    how_many: how many images and labels to generate.
    start_from: from which image to start.
    eos_list: optional list of end of sentence tokens, otherwise use default
      value `1`.
    vocab_filename: file within `tmp_dir` to read vocabulary from.
    vocab_size: integer target to generate vocabulary size to.

  Yields:
    A dictionary representing the images with the following fields:
    * image/encoded: the string encoding the image as JPEG,
    * image/format: the string "jpeg" representing image format,
    * image/class/label: a list of integers representing the caption,
    * image/height: an integer representing the height,
    * image/width: an integer representing the width.
    Every field is actually a list of the corresponding type.
  """
  eos_list = [1] if eos_list is None else eos_list
  if vocab_filename is not None:
    vocab_symbolizer = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, vocab_filename, vocab_size)
  _get_mscoco(tmp_dir)
  caption_filepath = (
      _MSCOCO_TRAIN_CAPTION_FILE if training else _MSCOCO_EVAL_CAPTION_FILE)
  caption_filepath = os.path.join(tmp_dir, caption_filepath)
  prefix = _MSCOCO_TRAIN_PREFIX if training else _MSCOCO_EVAL_PREFIX
  caption_file = io.open(caption_filepath)
  caption_json = json.load(caption_file)
  # Dictionary from image_id to ((filename, height, width), captions).
  image_dict = dict()
  for image in caption_json["images"]:
    image_dict[image["id"]] = [(image["file_name"], image["height"],
                                image["width"]), []]
  annotations = caption_json["annotations"]
  annotation_count = len(annotations)
  image_count = len(image_dict)
  tf.logging.info("Processing %d images and %d labels\n" % (image_count,
                                                            annotation_count))
  for annotation in annotations:
    image_id = annotation["image_id"]
    image_dict[image_id][1].append(annotation["caption"])

  data = list(image_dict.values())[start_from:start_from + how_many]
  random.shuffle(data)
  for image_info, labels in data:
    image_filename = image_info[0]
    image_filepath = os.path.join(tmp_dir, prefix, image_filename)
    with tf.gfile.Open(image_filepath, "r") as f:
      encoded_image_data = f.read()
      height, width = image_info[1], image_info[2]
      for label in labels:
        if vocab_filename is None:
          label = [ord(c) for c in label] + eos_list
        else:
          label = vocab_symbolizer.encode(label) + eos_list
        yield {
            "image/encoded": [encoded_image_data],
            "image/format": ["jpeg"],
            "image/class/label": label,
            "image/height": [height],
            "image/width": [width]
        }