def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  ret = text_encoder.SubwordTextEncoder()
  ret.build_from_token_counts(token_counts, min_count=5)
  ret.store_to_file(vocab_filepath)
  return ret
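# Usage sketch (not from the original source; the paths are hypothetical):
# build or load the subword vocab once, then round-trip text through ids with
# the standard SubwordTextEncoder encode/decode methods.
#
#   vocab = _get_or_build_subword_text_encoder(
#       tmp_dir="/tmp/lm1b", vocab_filepath="/tmp/lm1b/vocab.subwords")
#   ids = vocab.encode("A sample sentence.")
#   assert vocab.decode(ids) == "A sample sentence."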
def feature_encoders(self, data_dir):
  source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
  target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
  source_token = text_encoder.SubwordTextEncoder(source_vocab_filename)
  target_token = text_encoder.SubwordTextEncoder(target_vocab_filename)
  return {
      "inputs": source_token,
      "targets": target_token,
  }
def feature_encoders(self, data_dir):
  source_vocab_filename = os.path.join(
      data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
  target_vocab_filename = os.path.join(
      data_dir, "ice_target.tokens.vocab.%d" % self.targeted_vocab_size)
  source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
  target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
  return {
      "inputs": source_subtokenizer,
      "targets": target_subtokenizer,
  }
def test_load_from_file(self):
  # Test a vocab file with words not wrapped in quotes
  encoder = text_encoder.SubwordTextEncoder()
  correct_vocab = ["the", "and", "of"]
  vocab = io.StringIO("the\n"
                      "and\n"
                      "of\n")
  encoder._load_from_file_object(vocab)
  self.assertEqual(encoder._all_subtoken_strings, correct_vocab)

  # Test a vocab file with words wrapped in quotes
  encoder = text_encoder.SubwordTextEncoder()
  vocab = io.StringIO("\"the\"\n"
                      "\"and\"\n"
                      "\"of\"\n")
  encoder._load_from_file_object(vocab)
  self.assertEqual(encoder._all_subtoken_strings, correct_vocab)
def feature_encoders(self, data_dir):
  # This vocab file must be present within the data directory.
  vocab_filename = os.path.join(data_dir, "charset_size134.txt")
  return {
      "inputs": text_encoder.TextEncoder(),
      "targets": text_encoder.SubwordTextEncoder(vocab_filename)
  }
def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Read or create vocabulary."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print('Vocab file written to: ' + vocab_filepath)

  if tf.gfile.Exists(vocab_filepath):
    gs = text_encoder.SubwordTextEncoder(vocab_filepath)
    return gs
  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      example_file, corpus_max_lines=1000000)
  gs = gs.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)
  return gs
def feature_encoders(self, data_dir):
  vocab_filename = os.path.join(data_dir, self.vocab_file)
  encoder = text_encoder.SubwordTextEncoder(vocab_filename)
  return {
      "inputs": encoder,
      "targets": text_encoder.ClassLabelEncoder(["neg", "pos"]),
  }
def feature_encoders(self, data_dir):
  vocab_filename = os.path.join(
      data_dir, "vocab.endefr.%d" % self.target_vocab_size)
  subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename)
  return {
      "inputs": text_encoder.TextEncoder(),
      "targets": subtokenizer,
  }
def feature_encoders(self, data_dir):
  if self.is_character_level:
    encoder = text_encoder.ByteTextEncoder()
  else:
    vocab_filename = os.path.join(
        data_dir, "vocab.endefr.%d" % self.targeted_vocab_size)
    encoder = text_encoder.SubwordTextEncoder(vocab_filename)
  return {"targets": encoder}
def feature_encoders(self, data_dir):
  if self.is_character_level:
    encoder = text_encoder.ByteTextEncoder()
  elif self.use_subword_tokenizer:
    vocab_filename = os.path.join(data_dir, self.vocab_file)
    encoder = text_encoder.SubwordTextEncoder(vocab_filename)
  else:
    vocab_filename = os.path.join(data_dir, self.vocab_file)
    encoder = text_encoder.TokenTextEncoder(vocab_filename)
  if self.has_inputs:
    return {"inputs": encoder, "targets": encoder}
  return {"targets": encoder}
def test_reserved_token_chars_not_in_alphabet(self):
  corpus = "dog"
  token_counts = collections.Counter(corpus.split(" "))
  encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
      100, token_counts, 2, 100)
  filename = os.path.join(self.test_temp_dir, "out.voc")
  encoder1.store_to_file(filename)
  encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

  self.assertEqual(encoder1._alphabet, encoder2._alphabet)

  for t in text_encoder.RESERVED_TOKENS:
    for c in t:
      # Verify that encoders can encode all reserved token chars.
      encoder1.encode(c)
      encoder2.encode(c)
def main(_):
  """Convert a file to examples."""
  if FLAGS.subword_text_encoder_filename:
    encoder = text_encoder.SubwordTextEncoder(
        FLAGS.subword_text_encoder_filename)
  elif FLAGS.token_text_encoder_filename:
    encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
  elif FLAGS.byte_text_encoder:
    encoder = text_encoder.ByteTextEncoder()
  else:
    encoder = None
  reader = tf.python_io.tf_record_iterator(FLAGS.input_filename)
  total_sequences = 0
  total_input_tokens = 0
  total_target_tokens = 0
  max_input_length = 0
  max_target_length = 0
  for record in reader:
    x = tf.train.Example()
    x.ParseFromString(record)
    inputs = [int(i) for i in x.features.feature["inputs"].int64_list.value]
    targets = [int(i) for i in x.features.feature["targets"].int64_list.value]
    if FLAGS.print_inputs:
      print("INPUTS:\n" + encoder.decode(inputs) if encoder else inputs)
    if FLAGS.print_targets:
      print("TARGETS:\n" + encoder.decode(targets) if encoder else targets)
    total_input_tokens += len(inputs)
    total_target_tokens += len(targets)
    total_sequences += 1
    max_input_length = max(max_input_length, len(inputs))
    max_target_length = max(max_target_length, len(targets))

  tf.logging.info("total_sequences: %d", total_sequences)
  tf.logging.info("total_input_tokens: %d", total_input_tokens)
  tf.logging.info("total_target_tokens: %d", total_target_tokens)
  tf.logging.info("max_input_length: %d", max_input_length)
  tf.logging.info("max_target_length: %d", max_target_length)
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
  """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If
      None, then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)

  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)

  if vocab_filepath is not None:
    vocab.store_to_file(vocab_filepath)
  return vocab
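# Usage sketch (illustrative, not from the original source): any iterable of
# text lines works as the generator. Here a small in-memory list stands in for
# a real corpus, and data_dir=None skips writing the vocab file to disk.
#
#   lines = ["the quick brown fox", "jumps over the lazy dog"]
#   vocab = get_or_generate_vocab_inner(
#       data_dir=None, vocab_filename="vocab.subwords", vocab_size=100,
#       generator=iter(lines))
#   print(vocab.vocab_size)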
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
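# Example invocation (illustrative only: the script name and paths are
# hypothetical; the flag names are the ones referenced in main above, and the
# flag values shown are arbitrary sample settings, not documented defaults).
#
#   python build_subword_vocab.py \
#     --corpus_filepattern='/tmp/corpus/*.txt' \
#     --corpus_max_lines=100000 \
#     --min_count=5 \
#     --num_iterations=4 \
#     --output_filename=/tmp/vocab.subwords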