def main(_):
  # Read in wordcount file.
  with open(FLAGS.input_path) as wordcount_file:
    word_counts = [(line.split()[0], int(line.split()[1]))
                   for line in wordcount_file]

  # Add in padding tokens.
  reserved_tokens = FLAGS.reserved_tokens
  if FLAGS.num_pad_tokens:
    padded_tokens = ['<pad>']
    padded_tokens += ['<pad%d>' % i for i in range(1, FLAGS.num_pad_tokens)]
    reserved_tokens = padded_tokens + reserved_tokens

  params = learner.Params(FLAGS.upper_thresh, FLAGS.lower_thresh,
                          FLAGS.num_iterations, FLAGS.max_input_tokens,
                          FLAGS.max_token_length, FLAGS.max_unique_chars,
                          FLAGS.vocab_size, FLAGS.slack_ratio,
                          FLAGS.include_joiner_token, FLAGS.joiner,
                          reserved_tokens)

  vocab = learner.learn(word_counts, params)
  vocab = ''.join([line + '\n' for line in vocab])

  # Write vocab to file.
  with open(FLAGS.output_path, 'w') as vocab_file:
    vocab_file.write(vocab)
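# A hypothetical invocation of this script (the file name and flag values are
# illustrative assumptions, not taken from the source):
#
#   python generate_vocab.py \
#     --input_path=/tmp/word_counts.txt \
#     --output_path=/tmp/vocab.txt \
#     --vocab_size=32000 \
#     --num_pad_tokens=1
#
# `input_path` is expected to hold one "<token> <count>" pair per line, which
# is what the list comprehension in `main` above parses.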
def bert_vocab_from_dataset(dataset,
                            vocab_size: int,
                            reserved_tokens: List[str],
                            bert_tokenizer_params=None,
                            learn_params=None) -> List[str]:
  """Generate a Bert wordpiece vocabulary from a `tf.data.Dataset` of texts.

  ```
  import tensorflow_text as text

  vocab = bert_vocab_from_dataset(dataset, vocab_size, reserved_tokens,
                                  bert_tokenizer_params, learn_params)
  bert_tokenizer = text.BertTokenizer(vocab, **bert_tokenizer_params)
  token_ids = bert_tokenizer.tokenize(text)
  ```

  This uses Bert's splitting algorithm to split the text into words before
  generating the subword vocabulary from the resulting words. The resulting
  vocabulary _can_ be used directly with a `text.WordpieceTokenizer`, but note
  that the vocabulary will be sub-optimal or **broken** if you split the text
  into words a different way.

  ```
  wordpiece_tokenizer = text.WordpieceTokenizer(vocab, ...)
  words = split(text)
  token_ids = wordpiece_tokenizer.tokenize(words)
  ```

  Args:
    dataset: A `tf.data.Dataset` containing string-tensor elements.
    vocab_size: The target vocabulary size. This is the maximum size.
    reserved_tokens: A list of tokens that must be included in the vocabulary.
    bert_tokenizer_params: The `text.BertTokenizer` arguments relevant to
      vocabulary generation:
      * `lower_case`
      * `keep_whitespace`
      * `normalization_form`
      * `preserve_unused_token`
      See `BertTokenizer` for details. You should use the same values for
      these both to generate the vocabulary and to build the `BertTokenizer`.
    learn_params: A dict of additional keyword arguments for the vocabulary
      learning function. See `wordpiece_tokenizer_learner_lib.learn` for
      details.

  Returns:
    A list of strings containing the vocabulary.

  Raises:
    TypeError: If the dataset contains structured elements instead of single
      tensors.
  """
  if bert_tokenizer_params is None:
    bert_tokenizer_params = {}
  if learn_params is None:
    learn_params = {}

  element_spec = dataset.element_spec
  try:
    element_spec.shape
  except AttributeError:
    raise TypeError("The dataset should contain single-tensor elements.")

  tokenizer = bert_tokenizer.BasicTokenizer(**bert_tokenizer_params)
  words_dataset = dataset.map(tokenizer.tokenize)
  word_counts = learner.count_words(words_dataset)
  vocab = learner.learn(word_counts, vocab_size, reserved_tokens,
                        **learn_params)
  return vocab
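# A minimal usage sketch of `bert_vocab_from_dataset` (the dataset contents,
# vocabulary size, and reserved tokens below are illustrative assumptions):
#
#   import tensorflow as tf
#
#   dataset = tf.data.Dataset.from_tensor_slices(
#       ["the quick brown fox", "jumped over the lazy dog"])
#   vocab = bert_vocab_from_dataset(
#       dataset,
#       vocab_size=100,
#       reserved_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"],
#       bert_tokenizer_params={"lower_case": True})
#
# `vocab` is a list of strings that can then be passed to `text.BertTokenizer`
# as shown in the docstring above.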
def process(self, wordcounts):
  return learner.learn(wordcounts, self._params)
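# Assuming `process` belongs to a DoFn-style class (an assumption based on its
# signature) whose `self._params` is a `learner.Params` built as in `main`
# above, a hypothetical direct call might look like the following; the class
# name is made up for illustration:
#
#   fn = LearnVocabFn(params)        # hypothetical wrapper holding _params
#   vocab = fn.process(word_counts)  # returns the learned wordpiece vocabulary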