def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None,
                 preserve_unused_token=False):
        super(BertTokenizer, self).__init__()
        _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)
        if isinstance(vocab_lookup_table, str) or isinstance(
                vocab_lookup_table, ops.Tensor):
            init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
            vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
                init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

        self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                               normalization_form,
                                               preserve_unused_token)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)
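# A minimal usage sketch (not from the original source) of the BertTokenizer
# defined above. The vocab file path "vocab.txt" and the input sentence are
# illustrative assumptions; tokenize() first splits text into words with the
# BasicTokenizer and then into subword ids with the WordpieceTokenizer.
tokenizer = BertTokenizer("vocab.txt", lower_case=True)
token_ids = tokenizer.tokenize(["TensorFlow Text makes tokenization easy."])
# token_ids is a RaggedTensor of shape [batch, words, wordpieces] holding
# int64 ids, since token_out_type defaults to dtypes.int64.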
Example #2
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None,
                 preserve_unused_token=False):
        if isinstance(vocab_lookup_table, str) or isinstance(
                vocab_lookup_table, ops.Tensor):
            init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
            vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
                init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

        print("Before ", type(lower_case))
        if isinstance(lower_case, ops.Tensor):
            lower_case = tf.compat.v1.get_default_session().run(lower_case)
        print("After ", type(lower_case))

        self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                               normalization_form,
                                               preserve_unused_token)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)
Example #3
def _create_table(vocab, num_oov=1):
  init = lookup_ops.KeyValueTensorInitializer(
      vocab,
      math_ops.range(
          array_ops.size(vocab, out_type=dtypes.int64), dtype=dtypes.int64),
      key_dtype=dtypes.string,
      value_dtype=dtypes.int64)
  return lookup_ops.StaticVocabularyTableV1(
      init, num_oov, lookup_key_dtype=dtypes.string)
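# A small usage sketch (illustrative assumption; constant_op from
# tensorflow.python.framework is assumed to be imported) of _create_table
# above: in-vocabulary keys map to their index in `vocab`, anything else
# falls into the single OOV bucket, which gets id len(vocab) here.
table = _create_table(["a", "##b", "##c"])
ids = table.lookup(constant_op.constant(["a", "##c", "unseen"]))
# Once the table is initialized, ids evaluates to [0, 2, 3].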
Example #4
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator='##',
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token='[UNK]',
                 split_unknown_characters=False):
        """Initializes the WordpieceTokenizer.

    Args:
      vocab_lookup_table: A lookup table implementing the LookupInterface
        containing the vocabulary of subwords or a string which is the file path
        to the vocab.txt file.
      suffix_indicator: (optional) The characters prepended to a wordpiece to
        indicate that it is a suffix to another subword. Default is '##'.
      max_bytes_per_word: (optional) Max size of input token. Default is 100.
      max_chars_per_token: (optional) Max size of subwords, excluding suffix
        indicator. If known, providing this improves the efficiency of decoding
        long words.
      token_out_type: (optional) The type of the token to return. This can be
        `tf.int64` or `tf.int32` IDs, or `tf.string` subwords. The default is
        `tf.int64`.
      unknown_token: (optional) The string value to substitute for an unknown
        token. Default is "[UNK]". If set to `None`, no substitution occurs.
        If `token_out_type` is `tf.int32`/`tf.int64`, the `vocab_lookup_table`
        is used (after substitution) to convert the unknown token to an integer.
      split_unknown_characters: (optional) Whether to split out single unknown
        characters as subtokens. If False (default), words containing unknown
        characters will be treated as single unknown tokens.
    """
        super(WordpieceTokenizer, self).__init__()
        _tf_text_wordpiece_tokenizer_op_create_counter.get_cell().increase_by(
            1)

        if isinstance(vocab_lookup_table,
                      str) or (isinstance(vocab_lookup_table, ops.Tensor)
                               and vocab_lookup_table.dtype == dtypes.string):
            init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
            vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
                init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

        if not isinstance(vocab_lookup_table, lookup_ops.LookupInterface):
            raise TypeError('Unable to build a lookup table from {}'.format(
                vocab_lookup_table))

        self._vocab_lookup_table = vocab_lookup_table
        self._suffix_indicator = suffix_indicator
        self._max_bytes_per_word = max_bytes_per_word
        self._max_chars_per_token = (0 if max_chars_per_token is None else
                                     max_chars_per_token)
        self._token_out_type = token_out_type
        self._unknown_token = unknown_token if unknown_token else '[UNK]'
        self._use_unknown_token = True if unknown_token else False
        self._split_unknown_characters = split_unknown_characters
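# A minimal, illustrative sketch of the WordpieceTokenizer defined above in
# action. The vocabulary and input strings are assumptions for demonstration.
vocab = ["they", "##'", "##re", "the", "great", "##est"]
init = lookup_ops.KeyValueTensorInitializer(
    vocab, list(range(len(vocab))),
    key_dtype=dtypes.string, value_dtype=dtypes.int64)
table = lookup_ops.StaticVocabularyTableV1(
    init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

tokenizer = WordpieceTokenizer(table, token_out_type=dtypes.string)
subwords = tokenizer.tokenize(["they're", "the", "greatest"])
# subwords is a RaggedTensor:
# [["they", "##'", "##re"], ["the"], ["great", "##est"]]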
Example #5
    def testDetokenizeFailsForSparseVocab(self):
        vocab = ["a", "##b", "##c"]
        ids = [0, 10, 20]
        init = lookup_ops.KeyValueTensorInitializer(vocab,
                                                    ids,
                                                    key_dtype=dtypes.string,
                                                    value_dtype=dtypes.int64)
        table = lookup_ops.StaticVocabularyTableV1(
            init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
        self.evaluate(table.initializer)

        tokenizer = WordpieceTokenizer(table)
        words = ragged_factory_ops.constant([["abb", "abc"], ["abcbc"]])
        subwords_ids = tokenizer.tokenize(words)

        with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
                                    "detokenize.*?dense on the interval"):
            result = tokenizer.detokenize(subwords_ids)
            self.evaluate(result)
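    def testDetokenizeRoundTripForDenseVocab(self):
        # Illustrative sketch (not from the original source): when the ids form
        # a dense range [0, len(vocab)), as detokenize requires, the same
        # pipeline round-trips the input words.
        vocab = ["a", "##b", "##c"]
        init = lookup_ops.KeyValueTensorInitializer(vocab, [0, 1, 2],
                                                    key_dtype=dtypes.string,
                                                    value_dtype=dtypes.int64)
        table = lookup_ops.StaticVocabularyTableV1(
            init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
        self.evaluate(table.initializer)

        tokenizer = WordpieceTokenizer(table)
        words = ragged_factory_ops.constant([["abb", "abc"], ["abcbc"]])
        round_trip = tokenizer.detokenize(tokenizer.tokenize(words))
        self.assertAllEqual(words, round_trip)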
Example #6
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator="##",
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token="[UNK]",
                 split_unknown_characters=False,
                 lower_case=False,
                 keep_whitespace=False,
                 normalization_form=None):
        if isinstance(vocab_lookup_table, str):
            init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
            vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
                init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)

        self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                               normalization_form)
        self._wordpiece_tokenizer = WordpieceTokenizer(
            vocab_lookup_table, suffix_indicator, max_bytes_per_word,
            max_chars_per_token, token_out_type, unknown_token,
            split_unknown_characters)
Example #7
    def _create_table(self, vocab, num_oov=100):
        init = lookup_ops.TextFileIdTableInitializer(vocab)
        return lookup_ops.StaticVocabularyTableV1(init, num_oov)
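    # Illustrative sketch (not from the original source) of how a test case
    # could use the helper above; "vocab.txt" stands in for a newline-delimited
    # vocabulary file and constant_op is assumed to be imported.
    def testLookupFromFile(self):
        table = self._create_table("vocab.txt")
        self.evaluate(table.initializer)
        ids = self.evaluate(table.lookup(constant_op.constant(["the"])))
        # Each id is either the token's line number in vocab.txt or an OOV
        # bucket id in [vocab_size, vocab_size + num_oov).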