Example #1
def make_vocal_file():
    aaer = aaer_corpus.AAERParserTokens()
    generator_utils.get_or_generate_vocab_inner(
        data_dir=const.T2T_DATA_DIR,
        vocab_filename=const.T2T_AAER_VOLCAB_NAME,
        vocab_size=40000,
        generator=aaer.get_tokens())
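All of the examples on this page hand a text or token generator to generator_utils.get_or_generate_vocab_inner, which either loads an existing vocabulary file from data_dir or builds a new SubwordTextEncoder from the generator and caches it there. A minimal, self-contained sketch of that call, assuming TF 1.x-style tf.gfile; the directory, file name and sample lines are invented for illustration:

import tensorflow as tf
from tensor2tensor.data_generators import generator_utils

DATA_DIR = "/tmp/t2t_data"                    # hypothetical cache directory
VOCAB_FILENAME = "vocab.toy.1024.subwords"    # hypothetical vocab file name

def toy_line_generator():
    # Any iterable of text lines works; a real corpus reader goes here.
    for line in ["a short sample sentence", "another sample sentence"] * 100:
        yield line

tf.gfile.MakeDirs(DATA_DIR)  # make sure the cache directory exists
# Loads DATA_DIR/VOCAB_FILENAME if it already exists; otherwise builds an
# approximately 1024-subword vocabulary from the generator and stores it there.
encoder = generator_utils.get_or_generate_vocab_inner(
    data_dir=DATA_DIR,
    vocab_filename=VOCAB_FILENAME,
    vocab_size=1024,
    generator=toy_line_generator())
print(encoder.vocab_size)
print(encoder.encode("a short sample sentence"))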
Example #2
    def generator(self, data_dir, tmp_dir, train):
        """Generate examples."""
        # Download and extract
        compressed_filename = os.path.basename(self.URL)
        download_path = generator_utils.maybe_download(tmp_dir,
                                                       compressed_filename,
                                                       self.URL)
        imdb_dir = os.path.join(tmp_dir, "aclImdb")
        if not tf.gfile.Exists(imdb_dir):
            with tarfile.open(download_path, "r:gz") as tar:
                tar.extractall(tmp_dir)

        # Generate vocab
        encoder = generator_utils.get_or_generate_vocab_inner(
            data_dir, self.vocab_file, self.targeted_vocab_size,
            lambda: self.doc_generator(imdb_dir, "train"))

        # Generate examples
        dataset = "train" if train else "test"
        for doc, label in self.doc_generator(imdb_dir,
                                             dataset,
                                             include_label=True):
            yield {
                "inputs": encoder.encode(doc) + [EOS],
                "targets": [int(label)],
            }
Example #3
    def generator(self, data_dir, tmp_dir, train):
        """Generate examples."""
        data_file = TRAIN_DATASETS if train else TEST_DATASETS

        # Generate vocab
        raw_gen = RawDataGenerator()

        vocab_encoder = generator_utils.get_or_generate_vocab_inner(
            data_dir, self.vocab_file, self.targeted_vocab_size,
            raw_gen.generator(data_file, for_vocab=True))
        label_encoder = text_encoder.ClassLabelEncoder(
            class_labels_fname=LABEL_FILE)

        # Generate examples
        for label, entities, sentence in raw_gen.generator(data_file):
            entities = [vocab_encoder.encode(e) for e in entities]
            sentence = vocab_encoder.encode(sentence)

            entities_pos = raw_gen.find_start_position(entities, sentence)

            yield {
                "inputs": sentence,
                "targets": [label_encoder.encode(label)],
                'lexical': raw_gen.lexical_feature(entities_pos, sentence),
                'position1': raw_gen.position_feature(entities_pos[0],
                                                      sentence),
                'position2': raw_gen.position_feature(entities_pos[1],
                                                      sentence),
            }
Example #4
 def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
   if self.vocab_type == VocabType.CHARACTER:
     encoder = text_encoder.ByteTextEncoder()
   elif self.vocab_type == VocabType.SUBWORD:
     if force_get:
       vocab_filepath = os.path.join(data_dir, self.vocab_filename)
       encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
     else:
       other_problem = self.use_vocab_from_other_problem
       if other_problem:
         return other_problem.get_or_create_vocab(data_dir, tmp_dir, force_get)
       encoder = generator_utils.get_or_generate_vocab_inner(
           data_dir, self.vocab_filename, self.approx_vocab_size,
           self.generate_text_for_vocab(data_dir, tmp_dir),
           max_subtoken_length=self.max_subtoken_length,
           reserved_tokens=(
               text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
   elif self.vocab_type == VocabType.TOKEN:
     vocab_filename = os.path.join(data_dir, self.vocab_filename)
     encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                             replace_oov=self.oov_token)
   else:
     raise ValueError(
         "Unrecognized VocabType: %s" % str(self.vocab_type))
   return encoder
Example #5
    def get_or_generate_vocab(self, data_dir, tmp_dir=None):

        filename_base = "nst_enzh_%sk_tok_%s" % (self.approx_vocab_size,
                                                 "train")
        """collect training Files to be passed to generate vocab(src)"""
        src_file_name = self.compile_data(tmp_dir,
                                          "train",
                                          filename_base + ".lang1",
                                          file_ext="trans.txt")
        """collect training Files to be passed to generate vocab(trg),(*pinyin.txt or *tmp.txt)"""
        trg_file_name = self.compile_data(tmp_dir,
                                          "train",
                                          filename_base + ".lang2",
                                          file_ext="pinyin.txt")

        def traverse_samples(filepath, file_byte_budget):
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget_ = file_byte_budget
                counter = 0
                countermax = int(source_file.size() / file_byte_budget_ / 2)
                for line in source_file:
                    if counter < countermax:
                        counter += 1
                    else:
                        if file_byte_budget_ <= 0:
                            break
                        line = line.strip()
                        file_byte_budget_ -= len(line)
                        counter = 0
                        yield line

        src_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir,
            self.source_vocab_name,
            self.approx_vocab_size,
            traverse_samples(src_file_name, file_byte_budget=1e8),
        )
        trg_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir,
            self.target_vocab_name,
            self.approx_vocab_size,
            traverse_samples(trg_file_name, file_byte_budget=1e8),
        )

        return src_vocab, trg_vocab
Example #6
 def feature_encoders(self, data_dir):
     vocab_filename = os.path.join(data_dir, self.vocab_file)
     # encoder = text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
     encoder = generator_utils.get_or_generate_vocab_inner(
         data_dir=const.T2T_DATA_DIR,
         vocab_filename=vocab_filename,
         vocab_size=self.targeted_vocab_size,
         generator=aaer.AAERExParserTokens().get_tokens())
     return {"inputs": encoder, "targets": encoder}
Example #7
 def generator(self, data_dir, tmp_dir, _):
     encoder = generator_utils.get_or_generate_vocab_inner(
         data_dir, self.vocab_file, self.targeted_vocab_size,
         lambda: story_generator(tmp_dir))
     for story in story_generator(tmp_dir):
         summary, rest = _story_summary_split(story)
         encoded_summary = encoder.encode(summary) + [EOS]
         encoded_story = encoder.encode(rest) + [EOS]
         yield {"inputs": encoded_story, "targets": encoded_summary}
Example #8
 def generator(self, data_dir, tmp_dir, _):
   encoder = generator_utils.get_or_generate_vocab_inner(
       data_dir, self.vocab_file, self.targeted_vocab_size,
       page_generator(tmp_dir, max_docs=10000))
   for page in page_generator(tmp_dir):
     title = _page_title(page)
     encoded = encoder.encode(page) + [EOS]
     encoded_title = encoder.encode(title) + [EOS]
     yield {"inputs": encoded_title, "targets": encoded}
Example #9
 def generator(self, data_dir, tmp_dir, _):
   encoder = generator_utils.get_or_generate_vocab_inner(
       data_dir, self.vocab_file, self.targeted_vocab_size,
       lambda: page_generator(tmp_dir, max_docs=10000))
   for page in page_generator(tmp_dir):
     title = _page_title(page)
     encoded = encoder.encode(page) + [EOS]
     encoded_title = encoder.encode(title) + [EOS]
     yield {"inputs": encoded_title, "targets": encoded}
Example #10
 def generate_vocab(self):
     # Generate vocab
     token_generator = self.raw_gen.generator([TRAIN_FILE, TEST_FILE],
                                              for_vocab=True)
     self.vocab_encoder = generator_utils.get_or_generate_vocab_inner(
         OUTPUT_DIR, VOCAB_FILE, VOCAB_SIZE, token_generator)
     tf.logging.info('vocab_size: %d', self.vocab_encoder.vocab_size)
     self.label_encoder = text_encoder.ClassLabelEncoder(
         class_labels_fname=LABEL_FILE)
Example #11
 def generator(self, data_dir, tmp_dir, is_training):
     encoder = generator_utils.get_or_generate_vocab_inner(
         data_dir, self.vocab_file, self.targeted_vocab_size,
         example_generator(tmp_dir, is_training, sum_token=False))
     for example in example_generator(tmp_dir, is_training, sum_token=True):
         story, summary = _story_summary_split(example)
         encoded_summary = encoder.encode(summary) + [EOS]
         encoded_story = encoder.encode(story) + [EOS]
         yield {"inputs": encoded_story, "targets": encoded_summary}
Example #12
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        # train_dataset = self.get_training_dataset(tmp_dir)
        if dataset_split == problem.DatasetSplit.TRAIN:
            datasets = LM_TRAIN_DATASETS
            tag = "train"
        elif dataset_split == problem.DatasetSplit.EVAL:
            datasets = LM_DEV_DATASETS
            tag = "dev"
        else:
            datasets = LM_TEST_DATASETS
            tag = "test"

        # train = dataset_split == problem.DatasetSplit.TRAIN

        # datasets = train_dataset if train else LM_TEST_DATASETS
        # source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
        # target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
        source_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir=data_dir,
            vocab_filename=self.source_vocab_name,
            vocab_size=self.approx_vocab_size,
            generator=self.generate(tmp_dir=tmp_dir,
                                    source_filenames=self.source_filenames,
                                    index=1),
            max_subtoken_length=None)
        target_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir=data_dir,
            vocab_filename=self.target_vocab_name,
            vocab_size=self.approx_vocab_size,
            generator=self.generate(tmp_dir=tmp_dir,
                                    source_filenames=self.source_filenames,
                                    index=2),
            max_subtoken_length=1)
        # tag = "train" if train else "dev"
        filename_base = "thchs_pinyinzh_%sk_tok_%s" % (self.approx_vocab_size,
                                                       tag)
        data_path = self.compile_data(tmp_dir, datasets, filename_base)
        return text_problems.text2text_generate_encoded(
            text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                 data_path + ".lang2"),
            source_vocab, target_vocab)
Example #13
 def generator(self, data_dir, tmp_dir, is_training):
     all_files, urls_path = _maybe_download_corpora(tmp_dir, is_training)
     encoder = generator_utils.get_or_generate_vocab_inner(
         data_dir, self.vocab_file, self.targeted_vocab_size,
         example_generator(all_files, urls_path, sum_token=False))
     write_raw_text_to_files(all_files, urls_path, data_dir, tmp_dir,
                             is_training)
     for example in example_generator(all_files, urls_path, sum_token=True):
         story, summary = _story_summary_split(example)
         encoded_summary = encoder.encode(summary) + [EOS]
         encoded_story = encoder.encode(story) + [EOS]
         yield {"inputs": encoded_story, "targets": encoded_summary}
Example #14
 def generator(self, data_dir, tmp_dir, is_training):
   all_files, urls_path = _maybe_download_corpora(tmp_dir, is_training)
   encoder = generator_utils.get_or_generate_vocab_inner(
       data_dir, self.vocab_file, self.targeted_vocab_size,
       example_generator(all_files, urls_path, sum_token=False))
   write_raw_text_to_files(all_files, urls_path, data_dir, tmp_dir,
                           is_training)
   for example in example_generator(all_files, urls_path, sum_token=True):
     story, summary = _story_summary_split(example)
     encoded_summary = encoder.encode(summary) + [EOS]
     encoded_story = encoder.encode(story) + [EOS]
     yield {"inputs": encoded_story, "targets": encoded_summary}
Example #15
 def generator(self, data_dir, tmp_dir, _):
   encoder = generator_utils.get_or_generate_vocab_inner(
       data_dir, self.vocab_file, self.targeted_vocab_size,
       page_generator(tmp_dir, max_docs=1000))
   case_num = 0
   for page in page_generator(tmp_dir):
     encoded = encoder.encode(page)
     for i in xrange(len(encoded) // self.sequence_length):
       case_num += 1
       if self.max_cases and case_num > self.max_cases:
         return
       targets = encoded[
           i * self.sequence_length:(i + 1) * self.sequence_length]
       inputs = self.scramble(targets)
       yield {"inputs": inputs, "targets": targets}
Example #16
 def generator(self, data_dir, tmp_dir, _):
     encoder = generator_utils.get_or_generate_vocab_inner(
         data_dir, self.vocab_file, self.targeted_vocab_size,
         lambda: page_generator(tmp_dir, max_docs=1000))
     case_num = 0
     for page in page_generator(tmp_dir):
         encoded = encoder.encode(page)
         for i in xrange(len(encoded) // self.sequence_length):
             case_num += 1
             if self.max_cases and case_num > self.max_cases:
                 return
             targets = encoded[i * self.sequence_length:(i + 1) *
                               self.sequence_length]
             inputs = self.scramble(targets)
             yield {"inputs": inputs, "targets": targets}
Example #17
    def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
        if force_get:
            vocab_filepath = os.path.join(data_dir, self.vocab_filename)
            encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
            encoders = {"inputs": encoder,
                        "targets": encoder}

        else:
            encoder = generator_utils.get_or_generate_vocab_inner(
                data_dir=data_dir,
                vocab_filename=self.vocab_filename,
                vocab_size=self.approx_vocab_size,
                generator=self.generate_text_for_vocab(data_dir, tmp_dir),
                max_subtoken_length=50)
            encoders = {"inputs": encoder,
                        "targets": encoder}
        return encoders
Example #18
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    train_paths = self.training_filepaths(
        data_dir, self.num_shards, shuffled=False)
    dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)

    train_examples = self._examples(data_dir, tmp_dir, train=True)
    dev_examples = self._examples(data_dir, tmp_dir, train=False)

    encoder = generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_file, self.targeted_vocab_size,
        (e['sentence1'] + ' ' + e['sentence2']
         for e in train_examples + dev_examples)
        )

    generator_utils.generate_dataset_and_shuffle(
        self._inputs_and_targets(encoder, train_examples), train_paths,
        self._inputs_and_targets(encoder, dev_examples), dev_paths)
Example #19
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)

        train_examples = self._examples(data_dir, tmp_dir, train=True)
        dev_examples = self._examples(data_dir, tmp_dir, train=False)

        encoder = generator_utils.get_or_generate_vocab_inner(
            data_dir, self.vocab_file, self.targeted_vocab_size,
            (e['sentence1'] + ' ' + e['sentence2']
             for e in train_examples + dev_examples))

        generator_utils.generate_dataset_and_shuffle(
            self._inputs_and_targets(encoder, train_examples), train_paths,
            self._inputs_and_targets(encoder, dev_examples), dev_paths)
Example #20
 def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
     if self.vocab_type == VocabType.CHARACTER:
         encoder = text_encoder.ByteTextEncoder()
     elif self.vocab_type == VocabType.SUBWORD:
         if force_get:
             vocab_filepath = os.path.join(data_dir, self.vocab_filename)
             encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
         else:
             encoder = generator_utils.get_or_generate_vocab_inner(
                 data_dir, self.vocab_filename, self.approx_vocab_size,
                 self.generate_text_for_vocab(data_dir, tmp_dir))
     elif self.vocab_type == VocabType.TOKEN:
         vocab_filename = os.path.join(data_dir, self.vocab_filename)
         encoder = text_encoder.TokenTextEncoder(vocab_filename)
     else:
         raise ValueError("Unrecognized VocabType")
     return encoder
Example #21
    def generate_vocab(self, data_dir, tmp_dir, vocab_filename, vocab_size,
                       sources):
        """Generate a vocabulary from the datasets in sources."""
        def generate():
            tf.logging.info("Generating vocab from: %s", str(sources))
            for lang_file in sources:
                tf.logging.info("Reading file: %s" % lang_file)
                filepath = os.path.join(tmp_dir, lang_file)
                # Extract from tar if needed.
                # if not tf.gfile.Exists(filepath):
                #     read_type = "r:gz" if filename.endswith("tgz") else "r"
                #     with tarfile.open(compressed_file, read_type) as corpus_tar:
                #         corpus_tar.extractall(tmp_dir)
                #
                # For some datasets a second extraction is necessary.
                # if lang_file.endswith(".gz"):
                #     new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                #     if tf.gfile.Exists(new_filepath):
                #         tf.logging.info(
                #             "Subdirectory %s already exists, skipping unpacking" % filepath)
                #     else:
                #         tf.logging.info("Unpacking subdirectory %s" % filepath)
                #         gunzip_file(filepath, new_filepath)
                #     filepath = new_filepath

                # Use Tokenizer to count the word occurrences.
                with tf.gfile.GFile(filepath, mode="r") as source_file:
                    file_byte_budget = self.file_byte_budget
                    counter = 0
                    countermax = int(source_file.size() / file_byte_budget / 2)
                    for line in source_file:
                        if counter < countermax:
                            counter += 1
                        else:
                            if file_byte_budget <= 0:
                                break
                            line = line.strip()
                            file_byte_budget -= len(line)
                            counter = 0
                            yield line

        return generator_utils.get_or_generate_vocab_inner(
            data_dir, vocab_filename, vocab_size, generate())
Example #22
 def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
     if self.vocab_type == VocabType.CHARACTER:
         encoder = text_encoder.ByteTextEncoder()
     elif self.vocab_type == VocabType.SUBWORD:
         if force_get:
             vocab_filepath = os.path.join(data_dir, self.vocab_filename)
             encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
         else:
             other_problem = self.use_vocab_from_other_problem
             if other_problem:
                 return other_problem.get_or_create_vocab(
                     data_dir, tmp_dir, force_get)
             encoder = generator_utils.get_or_generate_vocab_inner(
                 data_dir,
                 self.vocab_filename,
                 self.approx_vocab_size,
                 self.generate_text_for_vocab(data_dir, tmp_dir),
                 max_subtoken_length=self.max_subtoken_length,
                 reserved_tokens=(text_encoder.RESERVED_TOKENS +
                                  self.additional_reserved_tokens))
     elif self.vocab_type == VocabType.SENTENCEPIECE:
         if force_get:
             vocab_filepath = os.path.join(data_dir,
                                           self.vocab_filename + '.model')
             encoder = text_encoder.SentencePieceEncoder(vocab_filepath)
         else:
             _, tmp_file_path = tempfile.mkstemp()
             with open(tmp_file_path, 'w') as fh:
                 for i, text in enumerate(
                         self.generate_text_for_vocab(data_dir, tmp_dir)):
                     fh.write(text.rstrip() + '\n')
             encoder = text_encoder.SentencePieceEncoder.get_or_generate_vocab(
                 data_dir, self.vocab_filename, self.approx_vocab_size,
                 [tmp_file_path])
     elif self.vocab_type == VocabType.TOKEN:
         vocab_filename = os.path.join(data_dir, self.vocab_filename)
         encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                                 replace_oov=self.oov_token)
     else:
         raise ValueError("Unrecognized VocabType: %s" %
                          str(self.vocab_type))
     return encoder
Example #23
def get_or_generate_vocabulary(data_dir,
                               tmp_dir,
                               data_prefix,
                               max_page_size_exp,
                               approx_vocab_size=32768,
                               strip=True):
    """Get or generate the vocabulary.

  Args:
    data_dir: a string
    tmp_dir: a string
    data_prefix: a string
    max_page_size_exp: an integer
    approx_vocab_size: an integer
    strip: a boolean

  Returns:
    a TextEncoder
  """
    num_pages_for_vocab_generation = approx_vocab_size // 3
    vocab_file = vocab_filename(approx_vocab_size, strip)

    def my_generator(data_prefix):
        """Line generator for vocab."""
        count = 0
        for page in corpus_page_generator(
                all_corpus_files(data_prefix)[::-1], tmp_dir,
                max_page_size_exp):
            revisions = page["revisions"]
            if revisions:
                text = get_text(revisions[-1], strip=strip)
                yield text
                count += 1
                if count % 100 == 0:
                    tf.logging.info("reading pages for vocab %d" % count)
                if count > num_pages_for_vocab_generation:
                    break

    return generator_utils.get_or_generate_vocab_inner(
        data_dir, vocab_file, approx_vocab_size, my_generator(data_prefix))
Example #24
def get_or_generate_vocabulary(data_dir,
                               tmp_dir,
                               data_prefix,
                               max_page_size_exp,
                               approx_vocab_size=32768,
                               strip=True):
  """Get or generate the vocabulary.

  Args:
    data_dir: a string
    tmp_dir: a string
    data_prefix: a string
    max_page_size_exp: an integer
    approx_vocab_size: an integer
    strip: a boolean

  Returns:
    a TextEncoder
  """
  num_pages_for_vocab_generation = approx_vocab_size // 3
  vocab_file = vocab_filename(approx_vocab_size, strip)

  def my_generator(data_prefix):
    """Line generator for vocab."""
    count = 0
    for page in corpus_page_generator(
        all_corpus_files(data_prefix)[::-1], tmp_dir, max_page_size_exp):
      revisions = page["revisions"]
      if revisions:
        text = get_text(revisions[-1], strip=strip)
        yield text
        count += 1
        if count % 100 == 0:
          tf.logging.info("reading pages for vocab %d" % count)
        if count > num_pages_for_vocab_generation:
          break

  return generator_utils.get_or_generate_vocab_inner(data_dir, vocab_file,
                                                     approx_vocab_size,
                                                     my_generator(data_prefix))
Example #25
 def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
   if self.vocab_type == VocabType.CHARACTER:
     encoder = text_encoder.ByteTextEncoder()
   elif self.vocab_type == VocabType.SUBWORD:
     if force_get:
       vocab_filepath = os.path.join(data_dir, self.vocab_filename)
       encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
     else:
       encoder = generator_utils.get_or_generate_vocab_inner(
           data_dir, self.vocab_filename, self.approx_vocab_size,
           self.generate_text_for_vocab(data_dir, tmp_dir),
           max_subtoken_length=self.max_subtoken_length,
           reserved_tokens=(
               text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
   elif self.vocab_type == VocabType.TOKEN:
     vocab_filename = os.path.join(data_dir, self.vocab_filename)
     encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                             replace_oov=self.oov_token)
   else:
     raise ValueError(
         "Unrecognized VocabType: %s" % str(self.vocab_type))
   return encoder
Example #26
  def generator(self, data_dir, tmp_dir, train):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    imdb_dir = os.path.join(tmp_dir, "aclImdb")
    if not tf.gfile.Exists(imdb_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate vocab
    encoder = generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_file, self.targeted_vocab_size,
        self.doc_generator(imdb_dir, "train"))

    # Generate examples
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
      yield {
          "inputs": encoder.encode(doc) + [EOS],
          "targets": [int(label)],
      }
Example #27
    def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
        if force_get:
            vocab_filepath = os.path.join(data_dir, self.vocab_filename)
            encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
            encoders = {"inputs": encoder,
                        "targets": encoder}
            for name in self.get_contexts_name():
                encoders[name] = encoder

        else:
            encoder = generator_utils.get_or_generate_vocab_inner(
                data_dir=data_dir,
                vocab_filename=self.vocab_filename,
                vocab_size=self.approx_vocab_size,
                generator=self.generate_text_for_vocab(data_dir, tmp_dir),
                max_subtoken_length=50,
                reserved_tokens=text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens)
            encoders = {"inputs": encoder,
                        "targets": encoder}
            for name in self.get_contexts_name():
                encoders[name] = encoder

        return encoders
Example #28
def get_or_generate_vocab(data_dir,
                          tmp_dir,
                          vocab_filename,
                          vocab_size,
                          sources,
                          file_byte_budget=1e7):
    """Generate a vocabulary from the datasets in sources."""
    def generate():
        tf.logging.info("Generating vocab from: %s", str(sources))
        for source in sources:
            path = source[0]

            for lang_file in source[1]:
                tf.logging.info("Reading file: %s" % lang_file)
                filepath = os.path.join(path, lang_file)

                # Use Tokenizer to count the word occurrences.
                with tf.gfile.GFile(filepath, mode="r") as source_file:
                    file_byte_budget_ = file_byte_budget
                    counter = 0
                    countermax = int(source_file.size() / file_byte_budget_ /
                                     2)
                    for line in source_file:
                        if counter < countermax:
                            counter += 1
                        else:
                            if file_byte_budget_ <= 0:
                                break
                            line = line.strip()
                            file_byte_budget_ -= len(line)
                            counter = 0
                            yield line

    return generator_utils.get_or_generate_vocab_inner(data_dir,
                                                       vocab_filename,
                                                       vocab_size, generate())
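Examples #5, #21 and #28 all subsample their corpora with the same byte-budget trick before feeding lines to the vocabulary builder. A standalone sketch of that inner loop over a plain iterable (the helper name and arguments are illustrative, not taken from any of the repositories above):

def sample_lines_for_vocab(lines, total_bytes, file_byte_budget):
    # Yield roughly every (countermax + 1)-th line until about
    # file_byte_budget bytes of text have been collected, so the vocabulary
    # sees lines spread across the corpus rather than only its beginning.
    countermax = int(total_bytes / file_byte_budget / 2)
    budget = file_byte_budget
    counter = 0
    for line in lines:
        if counter < countermax:
            counter += 1  # skip this line
        else:
            if budget <= 0:
                break
            line = line.strip()
            budget -= len(line)
            counter = 0
            yield line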
Example #29
 def get_or_generate_vocab(self, data_dir, tmp_dir):
   return generator_utils.get_or_generate_vocab_inner(
       data_dir, self.vocab_file, self.targeted_vocab_size,
       self.file_generator(
           self.train_text_filepaths(tmp_dir),
           max_chars_total=self.max_chars_for_vocab))
Example #30
import sys
import warnings
warnings.filterwarnings("ignore")
from tensor2tensor.data_generators import generator_utils

# The next check is not necessary; it only shows that you should not use the whole parent corpus.
print("Checking length of file")
with open("mixed.txt") as f:
    for i, l in enumerate(f):
        pass

    if i > 200000 or i < 100000:
        print(
            "Your 'mixed.txt' does not contain roughly 150k rows, which means that you have not balanced all languages. In our toy example this would not be a problem, but when you are dealing with a high-resource parent and a low-resource child it can quickly happen that most of the mixed corpus contains only parent sentences, and the generated vocabulary would then contain mainly parent subwords."
        )
        sys.exit(0)


def get_generator():
    with open("mixed.txt") as f:
        for line in f:
            yield line.strip()


gen = get_generator()
print(
    "Generating vocabulary. It will take a moment. Please read the next section of the tutorial.\n\n"
)
generator_utils.get_or_generate_vocab_inner("t2t_data", "vocab.cseten.wp",
                                            32000, gen)
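Once the script above has written the vocabulary, the resulting file can be loaded directly as a SubwordTextEncoder. A short follow-up sketch, reusing the "t2t_data"/"vocab.cseten.wp" paths from Example #30; the test sentence is arbitrary:

from tensor2tensor.data_generators import text_encoder

# Load the vocabulary produced by get_or_generate_vocab_inner above and
# round-trip a sentence through it.
encoder = text_encoder.SubwordTextEncoder("t2t_data/vocab.cseten.wp")
ids = encoder.encode("A short test sentence.")
print(len(ids), encoder.decode(ids))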
Example #31
 def generate_vocab(self, data_dir, wikis_dir, refs_dir):
     # Produce a SubwordTextEncoder from a subset of the data
     return generator_utils.get_or_generate_vocab_inner(
         data_dir, self.vocab_filename, self.target_vocab_size,
         self.generate_lines_for_vocab(wikis_dir, refs_dir))
Example #32
  def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
    train = dataset_split == problem.DatasetSplit.TRAIN
    # Called twice: for train and test

    # Get the list of the training samples (coding challenge samples)
    samples = list(generator_samples(tmp_dir, self.pb_constants))

    # Split between train and dev
    # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
    # difficulties in each set.
    # Need to sort the samples first before shuffling (as walk() isn't
    # deterministic)
    samples.sort(key=lambda x: x.desc_file)  # in-place
    rng = random.Random(7531)  # Local fixed seed
    rng.shuffle(samples)  # in-place

    # Train: 5019/5228 problems
    # Dev: 209/5228 problems
    len_samples = len(samples)
    split = len_samples // 25
    samples = samples[split:] if train else samples[:split]
    tf.logging.info("Number of samples for {}: {}/{}".format(
        "train" if train else "dev",
        len(samples),
        len_samples
    ))

    def generator_samples_content(get_source, get_target):
      """Generate samples."""
      source, target = None, None
      # Iterate over the coding samples
      for sample in samples:
        if get_source:
          with tf.gfile.GFile(sample.desc_file, mode="r") as source_file:
            source = source_file.read()

        if get_target:
          # Each challenge can have multiple implementations (or none)
          for code_file in sample.code_files:
            with tf.gfile.GFile(code_file, mode="r") as target_file:
              target = target_file.read()
              target = self.preprocess_target(target)
            yield source, target
        elif sample.code_files:  # Only take the source if a target exists
          yield source, target

    def generator_target():
      for _, target in generator_samples_content(False, True):
        yield target.strip()

    # Generate vocab for both source and target

    # TODO(lukaszkaiser): Fix vocab generation call. No sources given.
    assert not self.vocab_input_filename
    source_vocab = None
    # source_vocab = generator_utils.get_or_generate_vocab(
    #     data_dir, tmp_dir, self.vocab_input_filename, self.input_vocab_size)

    target_vocab = generator_utils.get_or_generate_vocab_inner(
        data_dir=data_dir,
        vocab_filename=self.vocab_target_filename,
        vocab_size=self.target_vocab_size,
        generator=generator_target(),)

    # Yield the training and testing samples
    eos_list = [EOS]
    for source, target in generator_samples_content(True, True):
      source_ints = source_vocab.encode(source.strip()) + eos_list
      target_ints = target_vocab.encode(target.strip()) + eos_list
      yield {
          "inputs": source_ints,
          "targets": target_ints,
      }
Example #33
    def train_generator(self, data_dir, tmp_dir, train):
        # Called twice: for train and test

        # Get the list of the training samples (coding challenge samples)
        samples = list(generator_samples(tmp_dir))

        # Split between train and dev
        # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
        # difficulties in each set.
        # Need to sort the samples first before shuffling (as walk() isn't
        # deterministic)
        samples.sort(key=lambda x: x.desc_file)  # in-place
        rng = random.Random(7531)  # Local fixed seed
        rng.shuffle(samples)  # in-place

        # Train: 5019/5228 problems
        # Dev: 209/5228 problems
        len_samples = len(samples)
        split = len_samples // 25
        samples = samples[split:] if train else samples[:split]
        tf.logging.info("Number of samples for {}: {}/{}".format(
            "train" if train else "dev", len(samples), len_samples))

        def generator_samples_content(get_source, get_target):
            source, target = None, None
            # Iterate over the coding samples
            for sample in samples:
                if get_source:
                    with tf.gfile.GFile(sample.desc_file,
                                        mode="r") as source_file:
                        source = source_file.read()

                if get_target:
                    # Each challenge can have multiple implementations (or none)
                    for code_file in sample.code_files:
                        with tf.gfile.GFile(code_file,
                                            mode="r") as target_file:
                            target = target_file.read()
                        yield source, target
                elif sample.code_files:  # Only take the source if a target exists
                    yield source, target

        def generator_target():
            for _, target in generator_samples_content(False, True):
                yield target.strip()

        # Generate vocab for both source and target

        source_vocab = generator_utils.get_or_generate_vocab(
            data_dir, tmp_dir, self.vocab_input_filename,
            self.input_vocab_size)

        target_vocab = generator_utils.get_or_generate_vocab_inner(
            data_dir=data_dir,
            vocab_filename=self.vocab_target_filename,
            vocab_size=self.target_vocab_size,
            generator_fn=generator_target,
        )

        # Yield the training and testing samples
        eos_list = [EOS]
        for source, target in generator_samples_content(True, True):
            source_ints = source_vocab.encode(source.strip()) + eos_list
            target_ints = target_vocab.encode(target.strip()) + eos_list
            yield {
                "inputs": source_ints,
                "targets": target_ints,
            }
Example #34
 def generate_vocab(self, data_dir, wikis_dir, refs_dir):
   # Produce a SubwordTextEncoder from a subset of the data
   return generator_utils.get_or_generate_vocab_inner(
       data_dir, self.vocab_filename, self.target_vocab_size,
       self.generate_lines_for_vocab(wikis_dir, refs_dir))
Example #35
def get_or_generate_vocab(data_dir, vocab_filename, vocab_size, source):
    vocab_generator = generate_lines_for_vocab(data_dir, source)
    return generator_utils.get_or_generate_vocab_inner(data_dir,
                                                       vocab_filename,
                                                       vocab_size,
                                                       vocab_generator)