def __init__(self,
             tfds_name: str = 'glue/sst2',
             vocab_path: str = 'vocab.txt',
             tokenizer: text.Tokenizer = text.WhitespaceTokenizer(),
             split='train'):
  """Initializes the SST2 data source."""
  self.dataset, self.info = tfds.load(tfds_name, split=split, with_info=True)
  # Look up the feature name of the text and label in the dataset.
  # We assume there is one text input and one label.
  text_fields = filter(_is_text_field, self.info.features.items())
  label_fields = filter(_is_class_label, self.info.features.items())
  self.text_feature_name, _ = next(text_fields)
  self.label_feature_name, _ = next(label_fields)
  # Load the vocabulary.
  self.vocab = vocabulary.Vocabulary(vocab_path=vocab_path)
  # Convert the sentences to sequences of token IDs and compute lengths.
  self.tokenizer = tokenizer
  self.tf_vocab = vocab_to_hashtable(self.vocab, unk_idx=self.vocab.unk_idx)
  self.examples = self.dataset.map(
      self.prepare_example, num_parallel_calls=AUTOTUNE).cache()
def _parse_tfrecord_function(example, lookup_table):
  example_fmt = {
      'opcodes': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64)
  }
  parsed = tf.io.parse_single_example(example, example_fmt)
  tokenizer = text.WhitespaceTokenizer()
  tokens = tokenizer.tokenize(parsed['opcodes'])
  ids = lookup_table.lookup(tokens)
  return ids, parsed['label']
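# A hedged sketch of the `lookup_table` the parse function above expects: a
# static token-to-id table built from a vocab file, with unknown opcodes
# mapped to -1. The file and record paths here are hypothetical.
import tensorflow as tf

opcode_table = tf.lookup.StaticHashTable(
    tf.lookup.TextFileInitializer(
        'opcode_vocab.txt',  # hypothetical: one opcode token per line
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER),
    default_value=-1)

dataset = tf.data.TFRecordDataset('examples.tfrecord')  # hypothetical path
dataset = dataset.map(lambda ex: _parse_tfrecord_function(ex, opcode_table))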
def get_tokenized_sequences(
    dataset: tf.data.Dataset,
    tokenizer: tftext.Tokenizer = tftext.WhitespaceTokenizer(),
    input_key: str = 'sentence') -> Iterable[Sequence[bytes]]:
  """Yields tokenized sequences for vocabulary building."""
  dataset = dataset.map(
      lambda example: tokenizer.tokenize(example[input_key]),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  for sentence in tfds.as_numpy(dataset):
    yield sentence
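# A minimal, hedged usage sketch (assuming the `tfds` import used above):
# stream the tokenized sequences once and count token frequencies to pick a
# vocabulary. The 30000 cap is an arbitrary illustrative choice.
import collections

counter = collections.Counter()
for tokens in get_tokenized_sequences(tfds.load('glue/sst2', split='train')):
  counter.update(tokens)
vocab = [token for token, _ in counter.most_common(30000)]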
def _parse_tfrecord_function(example, opcodes_lookup_table, bytes_lookup_table):
  example_fmt = {
      'mnemonics': tf.io.FixedLenFeature([], tf.string),
      'bytes': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64)
  }
  parsed = tf.io.parse_single_example(example, example_fmt)
  tokenizer = text.WhitespaceTokenizer()
  opcode_tokens = tokenizer.tokenize(parsed['mnemonics'])
  byte_tokens = tokenizer.tokenize(parsed['bytes'])
  opcode_ids = opcodes_lookup_table.lookup(opcode_tokens)
  byte_ids = bytes_lookup_table.lookup(byte_tokens)
  return opcode_ids, byte_ids, parsed['label']
def make_data(sentences, window_size):
  tokenizer = text.WhitespaceTokenizer()
  tokens = tokenizer.tokenize(sentences)
  # Join each sliding window of window_size + 1 tokens into a single string.
  ngrams = text.ngrams(
      tokens, window_size + 1, reduction_type=text.Reduction.STRING_JOIN)
  segments = np.array(
      [x[0].decode('UTF-8').split(' ') for x in ngrams.to_list()])
  # The first window_size tokens of each window form the input; the last
  # token is the one-hot prediction target.
  input_batch = [' '.join(x) for x in segments[:, 0:-1]]
  target_batch = to_categorical(
      np.vectorize(lambda x: word_index[x] - 1)(segments[:, -1]),
      n_class,
      dtype='float32')
  return input_batch, target_batch
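# A quick, hedged illustration of the windowing `make_data` relies on:
# `text.ngrams` with STRING_JOIN emits each sliding window as one joined
# string, which `make_data` then splits back into input/target parts.
import tensorflow_text as text

toks = text.WhitespaceTokenizer().tokenize(['to be or not to be'])
windows = text.ngrams(toks, 3, reduction_type=text.Reduction.STRING_JOIN)
print(windows.to_list())
# [[b'to be or', b'be or not', b'or not to', b'not to be']]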
def call(self, strings: tf.Tensor, training=False, **kwargs) -> tf.Tensor:
  """Convert a tensor of strings into a tensor of token ids.

  Uses the WordpieceTokenizer to convert a tensor of rank N into one of
  rank N + 1. The added dimension holds the tokens each string was split
  into, e.g.

    ["What time is it.", "it's 3 o'clock"] ->
        [[2, 3, 4, 5, 0, 0],
         [3, 5, 7, 1, 8, 0]]

  Parameters
  ----------
  strings:
      A tensor of strings to be tokenized.

  Returns
  -------
  The tensor of tokenized strings.
  """
  input_shape = strings.shape
  output_shape = input_shape.concatenate(
      tf.TensorShape([self.max_seq_length]))

  # Define the tokenizers and tokenize the strings.
  self.whitespace_tokenizer = tf_text.WhitespaceTokenizer()
  self.tokenizer = tf_text.WordpieceTokenizer(
      self.vocab_table, token_out_type=tf.int64)
  tokens = self.whitespace_tokenizer.tokenize('[SEP] ' + strings)
  tokens = self.tokenizer.tokenize(tokens)

  # Collapse the ragged token dimension by one and convert to a regular
  # tensor.
  tokens = self._merge_dims(tokens, -2)
  tokens = tokens.to_tensor(default_value=0)
  rank = len(tokens.shape)

  # Slice off the end if the sequence is too long, or pad if it's too short.
  tokens = tokens[..., :self.max_seq_length]
  seq_len = tf.shape(tokens)[-1]
  paddings = [[0, 0]] * (rank - 1) + [[0, self.max_seq_length - seq_len]]
  tokens = tf.pad(tokens, paddings, 'CONSTANT', constant_values=0)
  tokens = tf.ensure_shape(tokens, output_shape)
  return tokens
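# A self-contained, hedged sketch of the whitespace -> wordpiece pipeline the
# layer above wraps; the tiny vocabulary is illustrative only.
import tensorflow as tf
import tensorflow_text as tf_text

vocab = ['[SEP]', 'what', 'time', 'is', 'it']
init = tf.lookup.KeyValueTensorInitializer(
    vocab, tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string, value_dtype=tf.int64)
vocab_table = tf.lookup.StaticHashTable(init, default_value=-1)

words = tf_text.WhitespaceTokenizer().tokenize(['[SEP] what time is it'])
pieces = tf_text.WordpieceTokenizer(
    vocab_table, token_out_type=tf.int64).tokenize(words)
print(pieces.to_list())  # [[[0], [1], [2], [3], [4]]]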
def _parse_tfrecord_function(example, opcodes_lookup_table, bytes_lookup_table):
  example_fmt = {
      'opcodes': tf.io.FixedLenFeature([], tf.string),
      'bytes': tf.io.FixedLenFeature([], tf.string),
      'APIs': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64)
  }
  parsed = tf.io.parse_single_example(example, example_fmt)
  tokenizer = text.WhitespaceTokenizer()
  opcodes_tokens = tokenizer.tokenize(parsed['opcodes'])
  opcodes_ids = opcodes_lookup_table.lookup(opcodes_tokens)
  bytes_tokens = tokenizer.tokenize(parsed['bytes'])
  bytes_ids = bytes_lookup_table.lookup(bytes_tokens)
  feature_vector = tf.io.decode_raw(parsed['APIs'], tf.float32)
  return opcodes_ids, bytes_ids, feature_vector, parsed['label']
def tokenize(ds, dataset_name):
  """Tokenizes a line into words with alphanumeric characters."""

  def extract_strings(example):
    if dataset_name == 'shakespeare':
      return tf.expand_dims(example['snippets'], 0)
    elif dataset_name == 'stackoverflow':
      return tf.expand_dims(example['tokens'], 0)
    else:
      raise app.UsageError('Dataset not supported: ', dataset_name)

  def tokenize_line(line):
    return tf.data.Dataset.from_tensor_slices(tokenizer.tokenize(line)[0])

  def mask_all_symbolic_words(word):
    return tf.math.logical_not(
        tf_text.wordshape(word, tf_text.WordShape.IS_PUNCT_OR_SYMBOL))

  tokenizer = tf_text.WhitespaceTokenizer()
  ds = ds.map(extract_strings)
  ds = ds.flat_map(tokenize_line)
  ds = ds.map(tf_text.case_fold_utf8)
  ds = ds.filter(mask_all_symbolic_words)
  return ds
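# A hedged sketch of the punctuation/symbol mask used above, on literal
# words: IS_PUNCT_OR_SYMBOL is true only when every character is punctuation
# or symbol, so such "words" are exactly the ones the filter drops.
import tensorflow_text as tf_text

words = ['hello', '...', 'world', '!?']
print(tf_text.wordshape(words, tf_text.WordShape.IS_PUNCT_OR_SYMBOL).numpy())
# [False  True False  True]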
def __call__(self, x):
  # Constrained sequence
  cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
  cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
  cs_transition_weights = np.array(
      [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
       [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
       [0.0, 1.0, 2.0, 3.0, 0.0]],
      dtype=np.float32)
  cs_allowed_transitions = np.array(
      [[True, True, True, True, True], [True, True, True, True, True],
       [True, False, True, False, False], [True, True, True, True, True],
       [True, False, True, True, True]])
  constrained_sequence = text.viterbi_constrained_sequence(
      cs_input, [2, 2, 2],
      allowed_transitions=cs_allowed_transitions,
      transition_weights=cs_transition_weights,
      use_log_space=True,
      use_start_and_end_states=True)

  # Max spanning tree
  mst_num_nodes = tf.constant([4, 3], tf.int32)
  mst_scores = tf.constant(
      [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
       [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
      tf.int32)  # pyformat: disable
  (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)

  # Normalize
  normalized = text.case_fold_utf8(['A String'])
  normalized = text.normalize_utf8(normalized)

  # Regex split
  regex_split = text.regex_split(input=['Yo dawg!'], delim_regex_pattern=r'\s')

  # Rouge-L
  rl_hypotheses = tf.ragged.constant(
      [['captain', 'of', 'the', 'delta', 'flight'],
       ['the', '1990', 'transcript']])
  rl_references = tf.ragged.constant(
      [['delta', 'air', 'lines', 'flight'],
       ['this', 'concludes', 'the', 'transcript']])
  (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)

  # Sentence breaking version 1 (token dependent)
  sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                   ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
  sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
  sb_token_starts = []
  sb_token_ends = []
  for sentence in sb_token_word:
    sentence_string = ''
    sentence_start = []
    sentence_end = []
    for word in sentence:
      sentence_start.append(len(sentence_string))
      sentence_string = sentence_string.join([word, ' '])
      sentence_end.append(len(sentence_string))
    sb_token_starts.append(sentence_start)
    sb_token_ends.append(sentence_end)
  sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
  sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
  sb_token_properties = tf.ragged.constant(sb_token_properties, dtype=tf.int64)
  (sentence_breaking, _, _, _) = text.sentence_fragments(
      sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)

  # Sentence breaking version 2 (StateBasedSentenceBreaker)
  sbv2_text_input = [['Welcome to the U.S.! Harry'],
                     ['Wu Tang Clan; ain\'t nothing']]
  sentence_breaker_v2 = text.StateBasedSentenceBreaker()
  sbv2_fragment_text, _, _ = (
      sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))

  # Sentencepiece tokenizer
  sp_model_file = (
      'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
  sp_model = open(sp_model_file, 'rb').read()
  sp_tokenizer = text.SentencepieceTokenizer(sp_model)
  sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
  sentencepiece = sp_tokenizer.detokenize(sentencepiece)
  (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
  sentencepiece_size = sp_tokenizer.vocab_size()
  sentencepiece_id = sp_tokenizer.id_to_string(1)

  # Split merge tokenizer
  sm_tokenizer = text.SplitMergeTokenizer()
  split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                      [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])

  # Split merge from logits tokenizer
  smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
  split_merge_from_logits = smfl_tokenizer.tokenize(
      b'IloveFlume!',
      # One pair of logits for each Unicode character from the text. Each
      # pair indicates a "split" action if the first component is greater
      # than the second one, and a "merge" otherwise.
      [
          [2.7, -0.3],  # I: split
          [4.1, 0.82],  # l: split
          [-2.3, 4.3],  # o: merge
          [3.1, 12.2],  # v: merge
          [-3.0, 4.7],  # e: merge
          [2.7, -0.7],  # F: split
          [0.7, 15.0],  # l: merge
          [1.6, 23.0],  # u: merge
          [2.1, 11.0],  # m: merge
          [0.0, 20.0],  # e: merge
          [18.0, 0.7],  # !: split
      ])

  # Confirm the TF unicode_script op that requires ICU works.
  tf_unicode_script = tf.strings.unicode_script(
      [ord('a'), 0x0411, 0x82b8, ord(',')])

  # Unicode script tokenizer
  us_tokenizer = text.UnicodeScriptTokenizer()
  unicode_script = us_tokenizer.tokenize(['a string'])

  # Whitespace tokenizer
  ws_tokenizer = text.WhitespaceTokenizer()
  whitespace = ws_tokenizer.tokenize(['a string'])

  # Wordpiece tokenizer
  wp_initializer = tf.lookup.KeyValueTensorInitializer(
      ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
  self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                  default_value=-1)
  wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
  wordpiece = wp_tokenizer.tokenize(['i am'])

  # Wordshape
  wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                              text.WordShape.HAS_PUNCTUATION_DASH)

  # Assertion method
  def assert_check(tensor):
    return tf.assert_equal(tensor, tf.identity(tensor))

  # Assertions
  constrained_sequence_assert = assert_check(constrained_sequence.to_tensor())
  max_spanning_tree_assert = assert_check(max_spanning_tree)
  normalized_assert = assert_check(normalized)
  regex_split_assert = assert_check(regex_split.to_tensor())
  rouge_l_assert = assert_check(rouge_l)
  sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
  sentence_breaking_v2_assert = assert_check(sbv2_fragment_text.to_tensor())
  sentencepiece_assert = assert_check(sentencepiece.to_tensor())
  sentencepiece_id_assert = assert_check(sentencepiece_id)
  sentencepiece_size_assert = assert_check(sentencepiece_size)
  split_merge_assert = assert_check(split_merge)
  split_merge_from_logits_assert = assert_check(split_merge_from_logits)
  tf_unicode_script_assert = assert_check(tf_unicode_script)
  unicode_script_assert = assert_check(unicode_script.to_tensor())
  whitespace_assert = assert_check(whitespace.to_tensor())
  wordpiece_assert = assert_check(wordpiece.to_tensor())
  wordshapes_assert = assert_check(wordshapes)

  with tf.control_dependencies([
      constrained_sequence_assert, max_spanning_tree_assert, normalized_assert,
      regex_split_assert, rouge_l_assert, sentence_breaking_assert,
      sentence_breaking_v2_assert, sentencepiece_assert,
      sentencepiece_id_assert, sentencepiece_size_assert, split_merge_assert,
      split_merge_from_logits_assert, tf_unicode_script_assert,
      unicode_script_assert, whitespace_assert, wordpiece_assert,
      wordshapes_assert
  ]):
    y = tf.add(x, [1])
  return {'y': y}
def tokenize_fun(tokenizer):
  """Standard text processing function."""
  wsp = text.WhitespaceTokenizer()
  return utils.compose(tokenizer.tokenize, wsp.tokenize, text.case_fold_utf8)
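# `utils.compose` is project-specific; a hedged sketch of the usual
# right-to-left composition its usage above suggests (case-fold first,
# whitespace-split second, then the wrapped tokenizer):
def compose(*fns):
  def composed(x):
    for fn in reversed(fns):
      x = fn(x)
    return x
  return composed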
def main():
  # Unicode
  docs = tf.constant([
      u'Everything not saved will be lost.'.encode('UTF-16-BE'),
      u'Sad☹'.encode('UTF-16-BE')
  ])
  _ = tf.strings.unicode_transcode(
      docs, input_encoding='UTF-16-BE', output_encoding='UTF-8')

  # Tokenization
  # UnicodeScriptTokenizer
  tokenizer = text.UnicodeScriptTokenizer()
  tokens = tokenizer.tokenize(
      ['everything not saved will be lost', u'Sad☹'.encode('UTF-8')])
  print(f'Tokens: {tokens.to_list()}')

  # Unicode split
  tokens = tf.strings.unicode_split([u'仅今年前'.encode('UTF-8')], 'UTF-8')
  print(f'Tokens: {tokens.to_list()}')

  # Offsets
  tokenizer = text.UnicodeScriptTokenizer()
  (tokens, _, end_offsets) = tokenizer.tokenize_with_offsets(
      ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
  print(f'Tokens: {tokens.to_list()}')
  print(f'Offsets: {end_offsets.to_list()}')

  # TF.Data example
  docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                             ["It's a trap!"]])
  tokenizer = text.WhitespaceTokenizer()
  tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
  iterator = iter(tokenized_docs)
  print(f'First sentence tokens: {next(iterator).to_list()}')
  print(f'Second sentence tokens: {next(iterator).to_list()}')

  # Other text ops
  # Wordshape
  tokenizer = text.WhitespaceTokenizer()
  tokens = tokenizer.tokenize(
      ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
  # Is capitalized?
  f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
  # Are all letters uppercased?
  f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
  # Does the token contain punctuation?
  f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
  # Is the token a number?
  f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)
  print(f'Is capitalized? {f1.to_list()}')
  print(f'Are all letters uppercased? {f2.to_list()}')
  print(f'Does the token contain punctuation? {f3.to_list()}')
  print(f'Is the token a number? {f4.to_list()}')

  # N-grams & sliding window
  tokenizer = text.WhitespaceTokenizer()
  tokens = tokenizer.tokenize(
      ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
  # N-grams, in this case bi-grams (n = 2).
  bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
  print(f'Bi-grams: {bigrams.to_list()}')
def __call__(self, x):
  # Constrained sequence
  cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
  cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
  cs_transition_weights = np.array(
      [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
       [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
       [0.0, 1.0, 2.0, 3.0, 0.0]],
      dtype=np.float32)
  cs_allowed_transitions = np.array(
      [[True, True, True, True, True], [True, True, True, True, True],
       [True, False, True, False, False], [True, True, True, True, True],
       [True, False, True, True, True]])
  constrained_sequence = text.viterbi_constrained_sequence(
      cs_input, [2, 2, 2],
      allowed_transitions=cs_allowed_transitions,
      transition_weights=cs_transition_weights,
      use_log_space=True,
      use_start_and_end_states=True)

  # Max spanning tree
  mst_num_nodes = tf.constant([4, 3], tf.int32)
  mst_scores = tf.constant(
      [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
       [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
      tf.int32)  # pyformat: disable
  (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)

  # Normalize
  normalized = text.case_fold_utf8(['A String'])
  normalized = text.normalize_utf8(normalized)

  # Regex split
  regex_split = text.regex_split(input=['Yo dawg!'], delim_regex_pattern=r'\s')

  # Rouge-L
  rl_hypotheses = tf.ragged.constant(
      [['captain', 'of', 'the', 'delta', 'flight'],
       ['the', '1990', 'transcript']])
  rl_references = tf.ragged.constant(
      [['delta', 'air', 'lines', 'flight'],
       ['this', 'concludes', 'the', 'transcript']])
  (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)

  # Sentence breaking
  sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                   ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
  sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
  sb_token_starts = []
  sb_token_ends = []
  for sentence in sb_token_word:
    sentence_string = ''
    sentence_start = []
    sentence_end = []
    for word in sentence:
      sentence_start.append(len(sentence_string))
      sentence_string = sentence_string.join([word, ' '])
      sentence_end.append(len(sentence_string))
    sb_token_starts.append(sentence_start)
    sb_token_ends.append(sentence_end)
  sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
  sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
  sb_token_properties = tf.ragged.constant(sb_token_properties, dtype=tf.int64)
  (sentence_breaking, _, _, _) = text.sentence_fragments(
      sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)

  # Sentencepiece tokenizer
  sp_model_file = (
      'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
  sp_model = open(sp_model_file, 'rb').read()
  sp_tokenizer = text.SentencepieceTokenizer(sp_model)
  sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
  sentencepiece = sp_tokenizer.detokenize(sentencepiece)
  (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
  sentencepiece_size = sp_tokenizer.vocab_size()
  sentencepiece_id = sp_tokenizer.id_to_string(1)

  # Split merge tokenizer - not in this version
  sm_tokenizer = text.SplitMergeTokenizer()
  split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                      [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])

  # Unicode script tokenizer
  us_tokenizer = text.UnicodeScriptTokenizer()
  unicode_script = us_tokenizer.tokenize(['a string'])

  # Whitespace tokenizer
  ws_tokenizer = text.WhitespaceTokenizer()
  whitespace = ws_tokenizer.tokenize(['a string'])

  # Wordpiece tokenizer
  wp_initializer = tf.lookup.KeyValueTensorInitializer(
      ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
  self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                  default_value=-1)
  wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
  wordpiece = wp_tokenizer.tokenize(['i am'])

  # Wordshape
  wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                              text.WordShape.HAS_PUNCTUATION_DASH)

  with tf.control_dependencies([
      constrained_sequence, max_spanning_tree, normalized, regex_split,
      rouge_l, sentence_breaking, sentencepiece, sentencepiece_id,
      sentencepiece_size, split_merge, unicode_script, whitespace, wordpiece,
      wordshapes
  ]):
    y = tf.add(x, [1])
  return {'y': y}
def get_datasets(n_devices,
                 task_name,
                 data_dir=None,
                 batch_size=256,
                 max_length=2000):
  """Get algorithmic datasets."""
  if batch_size % n_devices:
    raise ValueError("Batch size %d isn't evenly divisible by n_devices %d" %
                     (batch_size, n_devices))
  train_path = data_dir + task_name + '_train.tsv'
  val_path = data_dir + task_name + '_val.tsv'
  test_path = data_dir + task_name + '_test.tsv'
  train_dataset = preprocess_dataset(train_path, batch_size)
  val_dataset = preprocess_dataset(val_path, batch_size)
  test_dataset = preprocess_dataset(test_path, batch_size)
  tf.logging.info('Finished preprocessing')
  tf.logging.info('Building vocab')
  # Build the vocab from (a sample of) the validation split.
  vocab_set = set()
  tokenizer = text.WhitespaceTokenizer()
  lengths = []
  for i, data in enumerate(val_dataset):
    examples = data['Source']
    examples = tokenizer.tokenize(examples.numpy())
    examples = np.reshape(examples, (-1)).tolist()
    lengths.append(len(examples))
    vocab_set.update(examples)
    if i % 1000 == 0:
      tf.logging.info('Processed {}'.format(i))
    if i > 1000:
      break
  vocab_set = list(set(vocab_set))
  tf.logging.info('Finished processing vocab size={}'.format(len(vocab_set)))
  encoder = tfds.deprecated.text.TokenTextEncoder(vocab_set)

  def tf_encode(x):
    result = tf.py_function(
        lambda s: tf.constant(encoder.encode(s.numpy())), [x], tf.int32)
    result.set_shape([None])
    return result

  def tokenize(d):
    return {
        'inputs': tf_encode(d['Source'])[:max_length],
        'targets': d['Target']
    }

  train_dataset = train_dataset.map(tokenize, num_parallel_calls=AUTOTUNE)
  val_dataset = val_dataset.map(tokenize, num_parallel_calls=AUTOTUNE)
  test_dataset = test_dataset.map(tokenize, num_parallel_calls=AUTOTUNE)
  max_shape = {'inputs': [max_length], 'targets': []}
  train_dataset = train_dataset.shuffle(
      buffer_size=1024, reshuffle_each_iteration=True).padded_batch(
          batch_size, padded_shapes=max_shape)
  val_dataset = val_dataset.padded_batch(batch_size, padded_shapes=max_shape)
  test_dataset = test_dataset.padded_batch(batch_size, padded_shapes=max_shape)
  train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
  val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)
  test_dataset = test_dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return train_dataset, val_dataset, test_dataset, encoder
def _call_whitespace_tokenizer_to_ragged(test_case):
  tokenizer = tf_text.WhitespaceTokenizer()
  return tokenizer.tokenize(test_case)
def tokenize(dataset):
  tokenizer = text.WhitespaceTokenizer()
  return tokenizer.tokenize(dataset)
def __init__(self):
  super(WhitespaceTokenizer, self).__init__()
  self._tokenizer = tf_text.WhitespaceTokenizer()
# https://www.tensorflow.org/tutorials/tensorflow_text/intro
import tensorflow as tf
import tensorflow_text as text

docs = tf.constant([
    u'Everything not saved will be lost.'.encode('UTF-16-BE'),
    u'Sad☹'.encode('UTF-16-BE')
])
utf8_docs = tf.strings.unicode_transcode(
    docs, input_encoding='UTF-16-BE', output_encoding='UTF-8')

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokens = tf.strings.unicode_split([u'仅今年前'.encode('UTF-8')], 'UTF-8')
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
(tokens, offset_starts, offset_limits) = tokenizer.tokenize_with_offsets(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
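# The offsets returned above are byte positions into each input string, so
# the tokens can be sliced back out of the original bytes; a hedged check on
# the first sentence:
sentence = tf.constant('everything not saved will be lost.')
starts, limits = offset_starts[0], offset_limits[0]
print(tf.strings.substr(sentence, starts, limits - starts).numpy())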
def tokenize_w_punctuation(tokenizer):
  """Text processing function which splits off punctuation."""
  wsp = text.WhitespaceTokenizer()
  return utils.compose(tokenizer.tokenize, wsp.tokenize,
                       tensor_punctuation_separator, text.case_fold_utf8)