def __init__(self, vocab_lookup_table, suffix_indicator="##", max_bytes_per_word=100, max_chars_per_token=None, token_out_type=dtypes.int64, unknown_token="[UNK]", split_unknown_characters=False, lower_case=False, keep_whitespace=False, normalization_form=None, preserve_unused_token=False): super(BertTokenizer, self).__init__() _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1) if isinstance(vocab_lookup_table, str) or isinstance( vocab_lookup_table, ops.Tensor): init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table) vocab_lookup_table = lookup_ops.StaticVocabularyTableV1( init, num_oov_buckets=1, lookup_key_dtype=dtypes.string) self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace, normalization_form, preserve_unused_token) self._wordpiece_tokenizer = WordpieceTokenizer( vocab_lookup_table, suffix_indicator, max_bytes_per_word, max_chars_per_token, token_out_type, unknown_token, split_unknown_characters)
def __init__(self, vocab_lookup_table, suffix_indicator="##", max_bytes_per_word=100, max_chars_per_token=None, token_out_type=dtypes.int64, unknown_token="[UNK]", split_unknown_characters=False, lower_case=False, keep_whitespace=False, normalization_form=None, preserve_unused_token=False): if isinstance(vocab_lookup_table, str) or isinstance( vocab_lookup_table, ops.Tensor): init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table) vocab_lookup_table = lookup_ops.StaticVocabularyTableV1( init, num_oov_buckets=1, lookup_key_dtype=dtypes.string) print("Before ", type(lower_case)) if isinstance(lower_case, ops.Tensor): lower_case = tf.compat.v1.get_default_session().run(lower_case) print("After ", type(lower_case)) self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace, normalization_form, preserve_unused_token) self._wordpiece_tokenizer = WordpieceTokenizer( vocab_lookup_table, suffix_indicator, max_bytes_per_word, max_chars_per_token, token_out_type, unknown_token, split_unknown_characters)
def _create_table(vocab, num_oov=1):
  init = lookup_ops.KeyValueTensorInitializer(
      vocab,
      math_ops.range(
          array_ops.size(vocab, out_type=dtypes.int64), dtype=dtypes.int64),
      key_dtype=dtypes.string,
      value_dtype=dtypes.int64)
  return lookup_ops.StaticVocabularyTableV1(
      init, num_oov, lookup_key_dtype=dtypes.string)
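# A short sketch of how a table from `_create_table` above might be used;
# the vocabulary contents are illustrative. Each string maps to its index,
# with `num_oov` hash buckets appended after the vocabulary for OOV keys.
from tensorflow.python.framework import constant_op

vocab_table = _create_table(["a", "##b", "##c"])  # ids 0, 1, 2
# OOV keys such as "zzz" hash into bucket ids >= len(vocab). In graph mode
# the table must be initialized first, e.g. via
# tf.compat.v1.tables_initializer().
ids = vocab_table.lookup(constant_op.constant(["a", "##c", "zzz"]))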
def __init__(self,
             vocab_lookup_table,
             suffix_indicator='##',
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token='[UNK]',
             split_unknown_characters=False):
  """Initializes the WordpieceTokenizer.

  Args:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords, or a string which is the file
      path to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of
      decoding long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` or `tf.int32` IDs, or `tf.string` subwords. The default is
      `tf.int64`.
    unknown_token: (optional) The string value to substitute for an unknown
      token. Default is "[UNK]". If set to `None`, no substitution occurs.
      If `token_out_type` is `tf.int32`/`tf.int64`, the `vocab_lookup_table`
      is used (after substitution) to convert the unknown token to an
      integer.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.
  """
  super(WordpieceTokenizer, self).__init__()
  _tf_text_wordpiece_tokenizer_op_create_counter.get_cell().increase_by(1)
  if isinstance(vocab_lookup_table, str) or (
      isinstance(vocab_lookup_table, ops.Tensor) and
      vocab_lookup_table.dtype == dtypes.string):
    init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  if not isinstance(vocab_lookup_table, lookup_ops.LookupInterface):
    raise TypeError('Unable to build a lookup table from {}'.format(
        vocab_lookup_table))
  self._vocab_lookup_table = vocab_lookup_table
  self._suffix_indicator = suffix_indicator
  self._max_bytes_per_word = max_bytes_per_word
  self._max_chars_per_token = (
      0 if max_chars_per_token is None else max_chars_per_token)
  self._token_out_type = token_out_type
  self._unknown_token = unknown_token if unknown_token else '[UNK]'
  self._use_unknown_token = True if unknown_token else False
  self._split_unknown_characters = split_unknown_characters
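# A hedged example of driving the tokenizer above directly with an
# in-memory table (built with a helper like `_create_table`); the
# vocabulary contents are illustrative, not from the original code.
table = _create_table(["they", "##'", "##re", "the", "great", "##est"])
tokenizer = WordpieceTokenizer(table, token_out_type=dtypes.string)
# Splits each input word into a ragged row of wordpieces, e.g.
# "they're" -> ["they", "##'", "##re"].
subwords = tokenizer.tokenize(["they're", "the", "greatest"])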
def testDetokenizeFailsForSparseVocab(self):
  vocab = ["a", "##b", "##c"]
  # Deliberately sparse, non-contiguous ids: detokenize() requires the id
  # space to be dense on [0, vocab_size).
  ids = [0, 10, 20]
  init = lookup_ops.KeyValueTensorInitializer(
      vocab, ids, key_dtype=dtypes.string, value_dtype=dtypes.int64)
  table = lookup_ops.StaticVocabularyTableV1(
      init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  self.evaluate(table.initializer)
  tokenizer = WordpieceTokenizer(table)
  words = ragged_factory_ops.constant([["abb", "abc"], ["abcbc"]])
  subwords_ids = tokenizer.tokenize(words)
  with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
                              "detokenize.*?dense on the interval"):
    result = tokenizer.detokenize(subwords_ids)
    self.evaluate(result)
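# For contrast with the failing test above: a sketch of the dense-vocab
# case detokenize() expects, where ids cover the interval [0, vocab_size).
# Built with a helper like `_create_table`; contents are illustrative.
table = _create_table(["a", "##b", "##c"])  # contiguous ids 0, 1, 2
tokenizer = WordpieceTokenizer(table)
token_ids = tokenizer.tokenize(["abc"])
# Round-trips because every id maps back to a unique wordpiece.
words = tokenizer.detokenize(token_ids)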
def __init__(self, vocab_lookup_table, suffix_indicator="##", max_bytes_per_word=100, max_chars_per_token=None, token_out_type=dtypes.int64, unknown_token="[UNK]", split_unknown_characters=False, lower_case=False, keep_whitespace=False, normalization_form=None): if isinstance(vocab_lookup_table, str): init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table) vocab_lookup_table = lookup_ops.StaticVocabularyTableV1( init, num_oov_buckets=1, lookup_key_dtype=dtypes.string) self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace, normalization_form) self._wordpiece_tokenizer = WordpieceTokenizer( vocab_lookup_table, suffix_indicator, max_bytes_per_word, max_chars_per_token, token_out_type, unknown_token, split_unknown_characters)
def _create_table(self, vocab, num_oov=100):
  init = lookup_ops.TextFileIdTableInitializer(vocab)
  return lookup_ops.StaticVocabularyTableV1(init, num_oov)