def test_lowercase_empty_string(self):
  txt = [
      "",
  ]
  expected = [
      "",
  ]
  self.assertAllEqual(expected, text.case_fold_utf8(txt))

def test_lowercase_one_string(self):
  txt = [
      " TExt to loWERcase! ",
  ]
  expected = [
      " text to lowercase! ",
  ]
  self.assertAllEqual(expected, text.case_fold_utf8(txt))

def preprocess(text):
  """Normalize the text, and return tokens."""
  assert len(text.get_shape().as_list()) == 2
  assert text.get_shape().as_list()[-1] == 1
  text = tf.reshape(text, [-1])
  text = tf_text.case_fold_utf8(text)
  tokenizer = tflite_text_api.WhitespaceTokenizer()
  return tokenizer.tokenize(text)

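# A hedged usage sketch for preprocess() above (not from the original source):
# it assumes `import tensorflow as tf`, that `tf_text` and `tflite_text_api`
# are the modules referenced by the function, and uses a made-up input batch.
# The asserts require a rank-2 string tensor whose last dimension is 1.
example_batch = tf.constant([["Some TEXT to Fold"], ["ANOTHER Line"]])  # [2, 1]
tokens = preprocess(example_batch)
# tokens is a RaggedTensor of lower-cased whitespace tokens, e.g.
# [[b'some', b'text', b'to', b'fold'], [b'another', b'line']]
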
def test_lowercase_text(self):
  txt = [
      "Punctuation and digits: -*/+$#%@%$123456789#^$*%&",
      "Non-latin UTF8 chars: ΘͽʦȺЩ",
      "Accented chars: ĎÔPQRŔSŠoóôpqrŕsštťuúvwxyý",
      "Non-UTF8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)",
      "Folded: ßς",
      "",
  ]
  expected = [
      "punctuation and digits: -*/+$#%@%$123456789#^$*%&",
      "non-latin utf8 chars: θͽʦⱥщ",
      "accented chars: ďôpqrŕsšoóôpqrŕsštťuúvwxyý",
      "non-utf8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)",
      "folded: ssσ",
      "",
  ]
  self.assertAllEqual(expected, text.case_fold_utf8(txt))

def basic_tokenize(text_input, lower_case=False, keep_whitespace=False):
  """Performs basic word tokenization for BERT.

  Args:
    text_input: A Tensor of untokenized strings.
    lower_case: A bool indicating whether or not to perform lowercasing.
      Default is False.
    keep_whitespace: A bool indicating whether or not whitespace tokens
      should be kept in the output.

  Returns:
    A RaggedTensor of tokens with shape [batch, (num_tokens)].
  """
  # Lowercase (if the option is set).
  if lower_case:
    text_input = tf_text.case_fold_utf8(text_input)

  # Normalize by NFD.
  text_input = tf_text.normalize_utf8(text_input, "NFD")

  # Strip out control characters and combining marks.
  text_input = tf.strings.regex_replace(text_input,
                                        r"\p{Cc}|\p{Cf}|\p{Mn}", "")

  # For Chinese and emoji characters, tokenize by unicode codepoints.
  script_tokenized = tf_text.unicode_script_tokenize(
      text_input, keep_whitespace=keep_whitespace, name="UTF-8")
  token_script_ids = tf.strings.unicode_script(
      tf.strings.unicode_decode(script_tokenized.flat_values, "UTF-8"))

  is_chinese = tf.equal(token_script_ids, _CHINESE_SCRIPT_ID)[:, :1].values
  is_emoji = tf_text.wordshape(script_tokenized.flat_values,
                               tf_text.WordShape.HAS_EMOJI)
  is_punct = tf_text.wordshape(script_tokenized.flat_values,
                               tf_text.WordShape.IS_PUNCT_OR_SYMBOL)
  split_cond = is_chinese | is_emoji | is_punct

  unicode_char_split = tf.strings.unicode_split(script_tokenized, "UTF-8")
  unicode_split_tokens = tf.where(
      split_cond,
      y=tf.expand_dims(script_tokenized.flat_values, 1),
      x=unicode_char_split.values)

  # Pack back into a [batch, (num_tokens), (num_unicode_chars)] RT.
  chinese_mix_tokenized = tf.RaggedTensor.from_row_lengths(
      values=unicode_split_tokens,
      row_lengths=script_tokenized.row_lengths())

  # Squeeze out to a [batch, (num_tokens)] RT.
  return collapse_dims(chinese_mix_tokenized)

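# A hedged usage sketch for basic_tokenize() above (not part of the original
# snippet): it assumes the module-level helpers `_CHINESE_SCRIPT_ID` and
# `collapse_dims` referenced by the function are defined elsewhere in the same
# file, and that `tf` / `tf_text` are imported as usual. The inputs are made up.
sample = tf.constant(["Héllo, World!", "I love 北京"])
tokens = basic_tokenize(sample, lower_case=True)
# tokens is a [2, (num_tokens)] RaggedTensor in which CJK characters and
# punctuation/emoji end up as single-character tokens.
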
def call(self, inputs: tf.Tensor):
  """Calls `text.SentencepieceTokenizer` on inputs.

  Args:
    inputs: A string Tensor of shape `(batch_size,)`.

  Returns:
    One or three RaggedTensors, depending on whether `tokenize_with_offsets`
    is False or True, respectively. These are
    tokens: A RaggedTensor of shape `[batch_size, (pieces)]` and type
      `int32`. `tokens[i,j]` contains the j-th piece in the i-th input.
    start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
      RaggedTensors of type `int64` with the same indices as tokens.
      Element `[i,j]` contains the byte offset at the start, or past the
      end, resp., for the j-th piece in the i-th input.
  """
  if self._strip_diacritics:
    if self.tokenize_with_offsets:
      raise ValueError(
          "`tokenize_with_offsets` is not supported yet when "
          "`strip_diacritics` is set to True (b/181866850).")
    inputs = text.normalize_utf8(inputs, "NFD")
    inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")

  if self._lower_case:
    inputs = text.case_fold_utf8(inputs)

  # Prepare to reshape the result to work around broken shape inference.
  batch_size = tf.shape(inputs)[0]

  def _reshape(rt):
    values = rt.values
    row_splits = rt.row_splits
    row_splits = tf.reshape(row_splits, [batch_size + 1])
    return tf.RaggedTensor.from_row_splits(values, row_splits)

  # Call the tokenizer.
  if self.tokenize_with_offsets:
    tokens, start_offsets, limit_offsets = (
        self._tokenizer.tokenize_with_offsets(inputs))
    return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets)
  else:
    tokens = self._tokenizer.tokenize(inputs)
    return _reshape(tokens)

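# A hedged usage sketch (not from the original source): call() above is a
# method of a Keras-style tokenizer layer that wraps text.SentencepieceTokenizer
# (self._tokenizer). Assuming `layer` is an instance of that class configured
# with lower_case=True and tokenize_with_offsets=False, calling it on a batch
# of strings returns a single RaggedTensor of int32 piece ids.
token_ids = layer(tf.constant(["Hello TensorFlow Text.", "A second line."]))
# token_ids has shape [2, (pieces)]; with tokenize_with_offsets=True the call
# would instead return (tokens, start_offsets, limit_offsets).
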
def __call__(self, x):
  # Constrained sequence
  cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
  cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
  cs_transition_weights = np.array(
      [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
       [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
       [0.0, 1.0, 2.0, 3.0, 0.0]],
      dtype=np.float32)
  cs_allowed_transitions = np.array([[True, True, True, True, True],
                                     [True, True, True, True, True],
                                     [True, False, True, False, False],
                                     [True, True, True, True, True],
                                     [True, False, True, True, True]])
  constrained_sequence = text.viterbi_constrained_sequence(
      cs_input, [2, 2, 2],
      allowed_transitions=cs_allowed_transitions,
      transition_weights=cs_transition_weights,
      use_log_space=True,
      use_start_and_end_states=True)

  # Max spanning tree
  mst_num_nodes = tf.constant([4, 3], tf.int32)
  mst_scores = tf.constant(
      [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
       [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
      tf.int32)  # pyformat: disable
  (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)

  # Normalize
  normalized = text.case_fold_utf8(['A String'])
  normalized = text.normalize_utf8(normalized)

  # Regex split
  regex_split = text.regex_split(input=['Yo dawg!'],
                                 delim_regex_pattern=r'\s')

  # Rouge-L
  rl_hypotheses = tf.ragged.constant(
      [['captain', 'of', 'the', 'delta', 'flight'],
       ['the', '1990', 'transcript']])
  rl_references = tf.ragged.constant(
      [['delta', 'air', 'lines', 'flight'],
       ['this', 'concludes', 'the', 'transcript']])
  (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)

  # Sentence breaking version 1 (token dependent)
  sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                   ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
  sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
  sb_token_starts = []
  sb_token_ends = []
  for sentence in sb_token_word:
    sentence_string = ''
    sentence_start = []
    sentence_end = []
    for word in sentence:
      sentence_start.append(len(sentence_string))
      sentence_string = sentence_string.join([word, ' '])
      sentence_end.append(len(sentence_string))
    sb_token_starts.append(sentence_start)
    sb_token_ends.append(sentence_end)
  sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
  sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
  sb_token_properties = tf.ragged.constant(sb_token_properties,
                                           dtype=tf.int64)
  (sentence_breaking, _, _, _) = text.sentence_fragments(
      sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)

  # Sentence breaking version 2 (StateBasedSentenceBreaker)
  sbv2_text_input = [['Welcome to the U.S.! Harry'],
                     ['Wu Tang Clan; ain\'t nothing']]
  sentence_breaker_v2 = text.StateBasedSentenceBreaker()
  sbv2_fragment_text, _, _ = (
      sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))

  # Sentencepiece tokenizer
  sp_model_file = (
      'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
  sp_model = open(sp_model_file, 'rb').read()
  sp_tokenizer = text.SentencepieceTokenizer(sp_model)
  sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
  sentencepiece = sp_tokenizer.detokenize(sentencepiece)
  (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
  sentencepiece_size = sp_tokenizer.vocab_size()
  sentencepiece_id = sp_tokenizer.id_to_string(1)

  # Split merge tokenizer
  sm_tokenizer = text.SplitMergeTokenizer()
  split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                      [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])

  # Split merge from logits tokenizer
  smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
  split_merge_from_logits = smfl_tokenizer.tokenize(
      b'IloveFlume!',
      # One pair of logits for each Unicode character from the text. Each
      # pair indicates a "split" action if the first component is greater
      # than the second one, and a "merge" otherwise.
      [
          [2.7, -0.3],  # I: split
          [4.1, 0.82],  # l: split
          [-2.3, 4.3],  # o: merge
          [3.1, 12.2],  # v: merge
          [-3.0, 4.7],  # e: merge
          [2.7, -0.7],  # F: split
          [0.7, 15.0],  # l: merge
          [1.6, 23.0],  # u: merge
          [2.1, 11.0],  # m: merge
          [0.0, 20.0],  # e: merge
          [18.0, 0.7],  # !: split
      ])

  # Confirm the TF unicode_script op that requires ICU works
  tf_unicode_script = tf.strings.unicode_script(
      [ord('a'), 0x0411, 0x82b8, ord(',')])

  # Unicode script tokenizer
  us_tokenizer = text.UnicodeScriptTokenizer()
  unicode_script = us_tokenizer.tokenize(['a string'])

  # Whitespace tokenizer
  ws_tokenizer = text.WhitespaceTokenizer()
  whitespace = ws_tokenizer.tokenize(['a string'])

  # Wordpiece tokenizer
  wp_initializer = tf.lookup.KeyValueTensorInitializer(
      ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
  self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                  default_value=-1)
  wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
  wordpiece = wp_tokenizer.tokenize(['i am'])

  # Wordshape
  wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                              text.WordShape.HAS_PUNCTUATION_DASH)

  # Assertion method
  def assert_check(tensor):
    return tf.assert_equal(tensor, tf.identity(tensor))

  # Assertions
  constrained_sequence_assert = assert_check(constrained_sequence.to_tensor())
  max_spanning_tree_assert = assert_check(max_spanning_tree)
  normalized_assert = assert_check(normalized)
  regex_split_assert = assert_check(regex_split.to_tensor())
  rouge_l_assert = assert_check(rouge_l)
  sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
  sentence_breaking_v2_assert = assert_check(sbv2_fragment_text.to_tensor())
  sentencepiece_assert = assert_check(sentencepiece.to_tensor())
  sentencepiece_id_assert = assert_check(sentencepiece_id)
  sentencepiece_size_assert = assert_check(sentencepiece_size)
  split_merge_assert = assert_check(split_merge)
  split_merge_from_logits_assert = assert_check(split_merge_from_logits)
  tf_unicode_script_assert = assert_check(tf_unicode_script)
  unicode_script_assert = assert_check(unicode_script.to_tensor())
  whitespace_assert = assert_check(whitespace.to_tensor())
  wordpiece_assert = assert_check(wordpiece.to_tensor())
  wordshapes_assert = assert_check(wordshapes)

  with tf.control_dependencies([
      constrained_sequence_assert, max_spanning_tree_assert,
      normalized_assert, regex_split_assert, rouge_l_assert,
      sentence_breaking_assert, sentence_breaking_v2_assert,
      sentencepiece_assert, sentencepiece_id_assert,
      sentencepiece_size_assert, split_merge_assert,
      split_merge_from_logits_assert, tf_unicode_script_assert,
      unicode_script_assert, whitespace_assert, wordpiece_assert,
      wordshapes_assert
  ]):
    y = tf.add(x, [1])
  return {'y': y}

def _do_lower_case(t):
  # Case-fold, NFD-normalize, then strip combining marks (accents).
  t = tf_text.case_fold_utf8(t)
  t = tf_text.normalize_utf8(t, "NFD")
  t = tf.strings.regex_replace(t, r"\p{Mn}", "")
  return t

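# A hedged usage sketch for _do_lower_case() above (example input is made up);
# it assumes `import tensorflow as tf` and `import tensorflow_text as tf_text`.
# Case folding, NFD normalization, and stripping \p{Mn} together remove accents:
print(_do_lower_case(tf.constant(["Fólded ÁCCENTS"])))
# tf.Tensor([b'folded accents'], shape=(1,), dtype=string)
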
def preprocess_text(text, label):
  standardized = tf_text.case_fold_utf8(text)
  tokenized = tokenizer.tokenize(standardized)
  vectorized = vocab_table.lookup(tokenized)
  return vectorized, label

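# A hedged usage sketch for preprocess_text() above (not from the original
# source): the function relies on a `tokenizer` and a `vocab_table` defined in
# the enclosing scope. For illustration only, assume a whitespace tokenizer and
# a tiny static vocabulary table, then map the function over a (text, label)
# tf.data.Dataset.
import tensorflow as tf
import tensorflow_text as tf_text

tokenizer = tf_text.WhitespaceTokenizer()
vocab_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(
        ["the", "quick", "fox"], [0, 1, 2],
        key_dtype=tf.string, value_dtype=tf.int64),
    num_oov_buckets=1)

ds = tf.data.Dataset.from_tensor_slices((["The Quick FOX"], [1]))
ds = ds.map(preprocess_text)  # yields (token-id tensor, label) pairs
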
def tokenize(text, unused_label):
  lower_case = tf_text.case_fold_utf8(text)
  return tokenizer.tokenize(lower_case)

def __call__(self, x):
  # Constrained sequence
  cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
  cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
  cs_transition_weights = np.array(
      [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
       [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
       [0.0, 1.0, 2.0, 3.0, 0.0]],
      dtype=np.float32)
  cs_allowed_transitions = np.array([[True, True, True, True, True],
                                     [True, True, True, True, True],
                                     [True, False, True, False, False],
                                     [True, True, True, True, True],
                                     [True, False, True, True, True]])
  constrained_sequence = text.viterbi_constrained_sequence(
      cs_input, [2, 2, 2],
      allowed_transitions=cs_allowed_transitions,
      transition_weights=cs_transition_weights,
      use_log_space=True,
      use_start_and_end_states=True)

  # Max spanning tree
  mst_num_nodes = tf.constant([4, 3], tf.int32)
  mst_scores = tf.constant(
      [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
       [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
      tf.int32)  # pyformat: disable
  (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)

  # Normalize
  normalized = text.case_fold_utf8(['A String'])
  normalized = text.normalize_utf8(normalized)

  # Regex split
  regex_split = text.regex_split(input=['Yo dawg!'],
                                 delim_regex_pattern=r'\s')

  # Rouge-L
  rl_hypotheses = tf.ragged.constant(
      [['captain', 'of', 'the', 'delta', 'flight'],
       ['the', '1990', 'transcript']])
  rl_references = tf.ragged.constant(
      [['delta', 'air', 'lines', 'flight'],
       ['this', 'concludes', 'the', 'transcript']])
  (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)

  # Sentence breaking
  sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                   ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
  sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
  sb_token_starts = []
  sb_token_ends = []
  for sentence in sb_token_word:
    sentence_string = ''
    sentence_start = []
    sentence_end = []
    for word in sentence:
      sentence_start.append(len(sentence_string))
      sentence_string = sentence_string.join([word, ' '])
      sentence_end.append(len(sentence_string))
    sb_token_starts.append(sentence_start)
    sb_token_ends.append(sentence_end)
  sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
  sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
  sb_token_properties = tf.ragged.constant(sb_token_properties,
                                           dtype=tf.int64)
  (sentence_breaking, _, _, _) = text.sentence_fragments(
      sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)

  # Sentencepiece tokenizer
  sp_model_file = (
      'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
  sp_model = open(sp_model_file, 'rb').read()
  sp_tokenizer = text.SentencepieceTokenizer(sp_model)
  sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
  sentencepiece = sp_tokenizer.detokenize(sentencepiece)
  (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
  sentencepiece_size = sp_tokenizer.vocab_size()
  sentencepiece_id = sp_tokenizer.id_to_string(1)

  # Split merge tokenizer - not in this version
  sm_tokenizer = text.SplitMergeTokenizer()
  split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                      [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])

  # Unicode script tokenizer
  us_tokenizer = text.UnicodeScriptTokenizer()
  unicode_script = us_tokenizer.tokenize(['a string'])

  # Whitespace tokenizer
  ws_tokenizer = text.WhitespaceTokenizer()
  whitespace = ws_tokenizer.tokenize(['a string'])

  # Wordpiece tokenizer
  wp_initializer = tf.lookup.KeyValueTensorInitializer(
      ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
  self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                  default_value=-1)
  wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
  wordpiece = wp_tokenizer.tokenize(['i am'])

  # Wordshape
  wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                              text.WordShape.HAS_PUNCTUATION_DASH)

  with tf.control_dependencies([
      constrained_sequence, max_spanning_tree, normalized, regex_split,
      rouge_l, sentence_breaking, sentencepiece, sentencepiece_id,
      sentencepiece_size, split_merge, unicode_script, whitespace, wordpiece,
      wordshapes
  ]):
    y = tf.add(x, [1])
  return {'y': y}

def test_lowercase_one_string_ragged(self):
  txt = ragged_factory_ops.constant([[" TExt ", "to", " loWERcase! "],
                                     [" TExt to loWERcase! "]])
  expected = [[" text ", "to", " lowercase! "],
              [" text to lowercase! "]]
  self.assertRaggedEqual(expected, text.case_fold_utf8(txt))

def preprocess(text):
  """Normalize the text, and return tokens."""
  text = tf.reshape(text, [-1])
  text = tf_text.case_fold_utf8(text)
  tokenizer = tflite_text_api.WhitespaceTokenizer()
  return tokenizer.tokenize(text)

def unpaired_tokenize(self, texts):
  if self.do_lower_case:
    texts = case_fold_utf8(texts)
  return self.tf_tokenizer.tokenize(texts)