Example No. 1
    def generator(self, data_dir, tmp_dir, is_training):
        """Generator for lm1b sentences.

        Args:
          data_dir: data dir.
          tmp_dir: tmp dir.
          is_training: a boolean.

        Yields:
          A dictionary {"inputs": [0], "targets": [<subword ids>]}
        """
        _maybe_download_corpus(tmp_dir)
        original_vocab = _original_vocab(tmp_dir)
        files = (_train_data_filenames(tmp_dir)
                 if is_training else [_dev_data_filename(tmp_dir)])
        if self.is_character_level:
            encoder = text_encoder.ByteTextEncoder()
        else:
            vocab_filepath = os.path.join(data_dir, self.vocab_file)
            encoder = _get_or_build_subword_text_encoder(
                tmp_dir, vocab_filepath)
        for filepath in files:
            tf.logging.info("filepath = %s", filepath)
            for line in tf.gfile.Open(filepath):
                tokens = encoder.encode(
                    _replace_oov(original_vocab,
                                 text_encoder.native_to_unicode(line)))
                tokens.append(EOS)
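                # "inputs" is a dummy [0]; language modeling problems read only "targets".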
                yield {"inputs": [0], "targets": tokens}
Example No. 2
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
    """Builds a SubwordTextEncoder based on the corpus.

    Args:
      tmp_dir: directory containing dataset.
      vocab_filepath: path to store (or load) vocab.

    Returns:
      a SubwordTextEncoder.
    """
    if tf.gfile.Exists(vocab_filepath):
        return text_encoder.SubwordTextEncoder(vocab_filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
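    # Token counts are estimated from only the first training shard, capped at max_lines.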
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
    ret.store_to_file(vocab_filepath)
    return ret
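Because of the tf.gfile.Exists check at the top, this helper acts as a build-once cache: a second call with the same vocab_filepath loads the stored vocab instead of rebuilding it. A minimal usage sketch, with placeholder directories:

import os

tmp_dir = "/tmp/t2t_datagen"                    # placeholder
vocab_filepath = "/tmp/t2t_data/vocab.lm1b.en"  # placeholder

# First call samples the corpus, builds the subword vocab and stores it on disk.
encoder = _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath)
# Subsequent calls hit the tf.gfile.Exists branch and simply reload the stored file.
encoder = _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath)
print(encoder.encode("hello world"))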
Example No. 3
def _original_vocab(tmp_dir):
    """Returns a set containing the original vocabulary.

    This is important for comparing with published results.

    Args:
      tmp_dir: directory containing dataset.

    Returns:
      a set of strings
    """
    vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
                 "vocab-2016-09-10.txt")
    vocab_filename = os.path.basename(vocab_url + ".en")
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if not os.path.exists(vocab_filepath):
        generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
    return set([
        text_encoder.native_to_unicode(l.strip())
        for l in tf.gfile.Open(vocab_filepath)
    ])
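Examples 1 and 2 pass this vocabulary set to a _replace_oov helper that is not reproduced on this page. The sketch below is an assumption about what that replacement looks like (unknown words mapped to a literal "UNK" token), not the library source:

def _replace_oov(original_vocab, line):
    """Assumed reimplementation: replace words missing from original_vocab with "UNK"."""
    return u" ".join(
        word if word in original_vocab else u"UNK" for word in line.split())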
Example No. 4
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
    """Inner implementation for vocab generators.

    Args:
      data_dir: The base directory where data and vocab files are stored. If None,
          then do not save the vocab even if it doesn't exist.
      vocab_filename: relative filename where vocab file is stored
      vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
      generator: a generator that produces tokens from the vocabulary

    Returns:
      A SubwordTextEncoder vocabulary object.
    """
    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = defaultdict(int)
    for item in generator:
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

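    # build_to_target_size binary-searches a minimum token count between 1 and 1e3
    # so that the resulting vocabulary lands near vocab_size.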
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)

    if vocab_filepath is not None:
        vocab.store_to_file(vocab_filepath)
    return vocab
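A hedged sketch of driving the helper directly with a plain Python generator over text lines; the corpus path, vocab filename, and target size below are placeholders, not values from the example above:

def line_generator(corpus_path):
    """Yield raw text lines; get_or_generate_vocab_inner tokenizes them itself."""
    for line in tf.gfile.Open(corpus_path):
        yield line.strip()

vocab = get_or_generate_vocab_inner(
    data_dir="/tmp/t2t_data",               # placeholder
    vocab_filename="vocab.example.32768",   # placeholder
    vocab_size=2**15,
    generator=line_generator("/tmp/corpus.txt"))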
Example No. 5
    def test_native_to_unicode(self):
        s = r"foo bar"
        s_unicode = text_encoder.native_to_unicode(s)
        if six.PY2:
            self.assertIsInstance(s_unicode, unicode)
        self.assertEqual(s_unicode, u"foo bar")