# Module-level dependencies assumed by the snippets below (matching the
# tensor2tensor data_generators layout these functions come from).
from collections import defaultdict
import os

import six
import tensorflow as tf

from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import tokenizer

EOS = text_encoder.EOS_ID


def generator(self, data_dir, tmp_dir, is_training):
  """Generator for lm1b sentences.

  Args:
    data_dir: data dir.
    tmp_dir: tmp dir.
    is_training: a boolean.

  Yields:
    A dictionary {"inputs": [0], "targets": [<subword ids>]}
  """
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  files = (_train_data_filenames(tmp_dir)
           if is_training else [_dev_data_filename(tmp_dir)])
  if self.is_character_level:
    encoder = text_encoder.ByteTextEncoder()
  else:
    vocab_filepath = os.path.join(data_dir, self.vocab_file)
    encoder = _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath)
  for filepath in files:
    tf.logging.info("filepath = %s", filepath)
    for line in tf.gfile.Open(filepath):
      tokens = encoder.encode(
          _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
      tokens.append(EOS)
      yield {"inputs": [0], "targets": tokens}
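
# A minimal sketch of driving the generator above, assuming a hypothetical
# problem instance `lm1b_problem` that exposes this method; the helper name
# and directories are illustrative, not part of the original module.
def _example_inspect_generator(lm1b_problem, data_dir, tmp_dir):
  """Logs the sizes of the first few generated examples (illustrative only)."""
  for i, example in enumerate(
      lm1b_problem.generator(data_dir, tmp_dir, is_training=True)):
    tf.logging.info("example %d: %d target ids", i, len(example["targets"]))
    if i >= 2:  # Look at the first three examples only.
      break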
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  ret = text_encoder.SubwordTextEncoder()
  ret.build_from_token_counts(token_counts, min_count=5)
  ret.store_to_file(vocab_filepath)
  return ret
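
# A hedged usage sketch: once the vocab file exists on disk, later calls load
# it directly, and the resulting encoder round-trips text through subword ids.
# `tmp_dir` and `vocab_path` are illustrative placeholders.
def _example_roundtrip(tmp_dir, vocab_path):
  encoder = _get_or_build_subword_text_encoder(tmp_dir, vocab_path)
  ids = encoder.encode(u"hello world")
  tf.logging.info("ids = %s, decoded = %s", ids, encoder.decode(ids))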
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ])
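
# The functions above rely on a `_replace_oov` helper that is not shown in
# this excerpt. A plausible sketch, assuming whitespace tokenization and the
# lm1b convention of spelling out-of-vocabulary words as "UNK":
def _replace_oov(original_vocab, line):
  """Replaces words not in the original vocabulary with "UNK"."""
  return u" ".join(
      [word if word in original_vocab else u"UNK" for word in line.split()])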
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
  """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If
        None, then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by
        SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)

  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)

  if vocab_filepath is not None:
    vocab.store_to_file(vocab_filepath)
  return vocab
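
# A minimal sketch of calling the function above with an in-memory corpus;
# the list of strings stands in for a real file-backed generator, and the
# filename and target size are illustrative.
def _example_build_vocab(data_dir):
  toy_corpus = ["the quick brown fox", "jumps over the lazy dog"]
  vocab = get_or_generate_vocab_inner(
      data_dir, "vocab.toy", 2**10, iter(toy_corpus))
  return vocab.encode("the fox")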
def test_native_to_unicode(self):
  s = r"foo bar"
  s_unicode = text_encoder.native_to_unicode(s)
  if six.PY2:
    self.assertIsInstance(s_unicode, unicode)
  self.assertEqual(s_unicode, u"foo bar")
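
# A hedged companion check (not in the original test file): on Python 3,
# native strings are already unicode, so asserting against six.text_type
# exercises the same property under both interpreters.
def test_native_to_unicode_is_text_type(self):
  s_unicode = text_encoder.native_to_unicode("foo bar")
  self.assertIsInstance(s_unicode, six.text_type)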