def _tokenize_example(document, summary):
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
        document = tf.strings.regex_replace(document, "\n", substitute_newline)
    # Remove space before special tokens.
    document = tf.strings.regex_replace(document, r" ([<\[]\S+[>\]])", b"\\1")
    document_ids = tokenizer.tokenize(document)
    if isinstance(document_ids, tf.RaggedTensor):
        document_ids = document_ids.to_tensor(0)
    document_ids = document_ids[:max_encoder_length]

    # Remove newline optionally.
    if substitute_newline:
        summary = tf.strings.regex_replace(summary, "\n", substitute_newline)
    # Remove space before special tokens.
    summary = tf.strings.regex_replace(summary, r" ([<\[]\S+[>\]])", b"\\1")
    summary_ids = tokenizer.tokenize(summary)
    # Add the [EOS] (1) special token.
    suffix = tf.constant([1])
    summary_ids = tf.concat([summary_ids, suffix], axis=0)
    if isinstance(summary_ids, tf.RaggedTensor):
        summary_ids = summary_ids.to_tensor(0)
    summary_ids = summary_ids[:max_decoder_length]

    return document_ids, summary_ids

def input_fn():
    # text input
    text = tf.compat.v1.placeholder(tf.string, [batch_size], name="input_text")

    # text tokenize
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
        text = tf.strings.regex_replace(text, "\n", substitute_newline)
    # Remove space before special tokens.
    text = tf.strings.regex_replace(text, r" ([<\[]\S+[>\]])", b"\\1")
    ids = tokenizer.tokenize(text)
    if isinstance(ids, tf.RaggedTensor):
        ids = ids.to_tensor(0)

    # text padding: pad only if necessary and reshape properly
    padded_ids = dynamic_padding(ids, max_encoder_length)
    ids = tf.slice(padded_ids, [0, 0], [batch_size, max_encoder_length])

    receiver_tensors = {"input": text}
    features = {"input_ids": tf.cast(ids, tf.int32, name="input_ids")}

    return tf.estimator.export.ServingInputReceiver(
        features=features, receiver_tensors=receiver_tensors)

def input_fn():
    # text input
    text = tf.compat.v1.placeholder(tf.string, [batch_size], name="input_text")

    # text tokenize
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
        text = tf.strings.regex_replace(text, "\n", substitute_newline)
    ids = tokenizer.tokenize(text)
    ids = ids[:, :max_encoder_length - 2]

    # Add [CLS] and [SEP] special tokens.
    prefix = tf.repeat(tf.constant([[65]]), batch_size, axis=0)
    suffix = tf.repeat(tf.constant([[66]]), batch_size, axis=0)
    ids = tf.concat([prefix, ids, suffix], axis=1)
    if isinstance(ids, tf.RaggedTensor):
        ids = ids.to_tensor(0)

    # text padding: pad only if necessary and reshape properly
    padded_ids = dynamic_padding(ids, max_encoder_length)
    ids = tf.slice(padded_ids, [0, 0], [batch_size, max_encoder_length])

    receiver_tensors = {"input": text}
    features = {"input_ids": tf.cast(ids, tf.int32, name="input_ids")}

    return tf.estimator.export.ServingInputReceiver(
        features=features, receiver_tensors=receiver_tensors)

def _tokenize_example(context, question):
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
        context = tf.strings.regex_replace(context, "\n", substitute_newline)
    # Remove space before special tokens.
    context = tf.strings.regex_replace(context, r" ([<\[]\S+[>\]])", b"\\1")
    context_ids = tokenizer.tokenize(context)
    if isinstance(context_ids, tf.RaggedTensor):
        context_ids = context_ids.to_tensor(0)
    context_ids = context_ids[:max_encoder_length]

    # Remove newline optionally.
    if substitute_newline:
        question = tf.strings.regex_replace(question, "\n", substitute_newline)
    # Remove space before special tokens.
    question = tf.strings.regex_replace(question, r" ([<\[]\S+[>\]])", b"\\1")
    question_ids = tokenizer.tokenize(question)
    # Add the [EOS] (1) special token.
    suffix = tf.constant([1])
    question_ids = tf.concat([question_ids, suffix], axis=0)
    if isinstance(question_ids, tf.RaggedTensor):
        question_ids = question_ids.to_tensor(0)
    question_ids = question_ids[:max_decoder_length]

    return context_ids, question_ids

def c4_preprocess(dataset, training, max_target_length=-1,
                  tokenization=None, spm_path=None):
    """Pre-processing function for the C4 dataset."""
    del training

    def unicode_decode_chars(features, targets):
        targets = tf.strings.unicode_decode(features['text'], 'UTF-8')
        targets = tf.cast(targets, tf.int64)
        features['targets'] = targets
        features['inputs'] = targets
        return (features, targets)

    def spc_tokenize(tokenizer, features, targets):
        del targets
        tokenized_text = tokenizer.tokenize(features['text'])
        features['targets'] = tf.cast(tokenized_text, tf.int64)
        features['inputs'] = features['targets']
        return features, features['targets']

    if tokenization == 'spc':
        spm_path = spm_path or t5_utils.DEFAULT_SPM_PATH
        with tf.compat.v1.gfile.GFile(spm_path, 'rb') as f:
            spc_model = f.read()
        tokenizer = tf_text.SentencepieceTokenizer(model=spc_model)
        dataset = dataset.map(functools.partial(spc_tokenize, tokenizer))
    else:
        dataset = dataset.map(unicode_decode_chars)

    def target_right_length(_, target):
        return tf.less(tf.shape(target)[0], max_target_length + 1)

    if max_target_length > 0:
        dataset = dataset.filter(target_right_length)

    return dataset

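# A hedged usage sketch for c4_preprocess above: it assumes a tf.data pipeline of
# (features, targets) tuples whose features contain a 'text' key, and a reachable
# SentencePiece model; the dataset name and model path are placeholders.
import tensorflow_datasets as tfds

raw_ds = tfds.load("c4/en", split="train")
paired_ds = raw_ds.map(lambda ex: (ex, ex["text"]))  # (features, targets) pairs
train_ds = c4_preprocess(
    paired_ds,
    training=True,
    max_target_length=512,
    tokenization="spc",
    spm_path="/path/to/spm.model")
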
def do_masking(example):
    if "tfds://" == data_dir[:7]:
        text = example["text"]
    else:
        text = example
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
        text = tf.strings.regex_replace(text, "\n", substitute_newline)
    subtokens = tokenizer.tokenize(text)
    (subtokens, masked_lm_positions, masked_lm_ids,
     masked_lm_weights) = tf.compat.v1.py_func(
         numpy_masking, [subtokens],
         [tf.int32, tf.int32, tf.int32, tf.float32],
         stateful=False)

    features = {
        "input_ids": subtokens,
        "segment_ids": tf.zeros_like(subtokens),
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_weights,
        "next_sentence_labels": tf.zeros([1], dtype=tf.int64),
    }
    return features

def setup(self):
    self.tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(self.vocab_model_file, "rb").read())
    self.sentence_tokenizer = nltk.load(SENTENCE_TOKENIZER_PATH)
    self.delimiter_range_pair = rendering_utils.get_default_delimiter_range_pair(
        task=self.task,
        delimiter_type=self.delimiter_type,
    )

def load_sentencepiece_tokenizer(model_path,
                                 add_bos=False,
                                 add_eos=True,
                                 reverse=False):
    """Load a tf-text SentencePiece tokenizer from the given model filepath."""
    with tf.io.gfile.GFile(model_path, 'rb') as model_fp:
        sp_model = model_fp.read()
    sp_tokenizer = tftxt.SentencepieceTokenizer(
        model=sp_model, add_bos=add_bos, add_eos=add_eos, reverse=reverse)
    return sp_tokenizer

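# A minimal usage sketch for the loader above (the model path is a placeholder).
# tokenize() returns a tf.RaggedTensor of int32 ids with EOS appended because
# add_eos=True; detokenize() maps the ids back to strings.
sp_tokenizer = load_sentencepiece_tokenizer("/path/to/spm.model", add_eos=True)
token_ids = sp_tokenizer.tokenize(["hello world", "a longer second sentence"])
round_trip = sp_tokenizer.detokenize(token_ids)
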
def __init__(self, params: WMTDataConfig):
    self._params = params
    self._max_seq_length = params.max_seq_length
    self._static_batch = params.static_batch
    self._global_batch_size = params.global_batch_size
    if self._params.transform_and_batch:
        self._tokenizer = tftxt.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(params.sentencepiece_model_path, 'rb').read(),
            add_eos=True)

def __post_init__(self):
    tokenizer = tensorflow_text.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(self.vocab_path, 'rb').read(), add_eos=True)
    eos_token = tokenizer.string_to_id('</s>')
    # Work-around for frozen dataclasses:
    # https://stackoverflow.com/questions/53756788
    object.__setattr__(self, 'eos_token', eos_token)
    object.__setattr__(self, '_tokenizer', tokenizer)

def __init__(self, model=None, model_path=None):
    super(SentencepieceTokenizer, self).__init__()
    if model_path:
        self.proto = open(model_path, "rb").read()
    if model:
        if isinstance(model, str):
            asc_str = model.encode("ascii")
            self.proto = base64.decodebytes(asc_str)
        if isinstance(model, bytes):
            self.proto = model
    self.tokenizer = text.SentencepieceTokenizer(self.proto)

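# Hedged usage of the wrapper above: the serialized model proto may be passed as a
# file path, as raw bytes, or as an ASCII base64 string (paths here are placeholders).
import base64

with open("/path/to/spm.model", "rb") as f:
    raw_proto = f.read()
tok_from_path = SentencepieceTokenizer(model_path="/path/to/spm.model")
tok_from_bytes = SentencepieceTokenizer(model=raw_proto)
tok_from_b64 = SentencepieceTokenizer(
    model=base64.encodebytes(raw_proto).decode("ascii"))
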
def __init__(self, params):
    super().__init__(params)
    p = self.params
    with tf.io.gfile.GFile(p.spm_model, 'rb') as f:
        self._tokenizer = tf_text.SentencepieceTokenizer(
            model=f.read(),
            out_type=tf.int32,
            nbest_size=p.nbest_size,
            alpha=p.alpha,
            reverse=False,
            add_bos=False,
            add_eos=p.append_eos)

def __init__(self, params, model: tf.keras.Model, inference_step=None):
    super().__init__(params, model, inference_step)
    self._sp_tokenizer = tf_text.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(),
        add_eos=True)
    try:
        empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
    except tf.errors.InternalError:
        raise ValueError(
            "EOS token not in tokenizer vocab. "
            "Please make sure the tokenizer generates a single token for an "
            "empty string.")
    self._eos_id = empty_str_tokenized.item()

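# Sketch of the EOS-id probe used above, shown standalone with a placeholder model
# path: with add_eos=True, tokenizing the empty string yields exactly one id, the
# EOS id, so .numpy().item() recovers it.
probe = tf_text.SentencepieceTokenizer(
    model=tf.io.gfile.GFile("/path/to/spm.model", "rb").read(), add_eos=True)
eos_id = probe.tokenize("").numpy().item()
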
def test_tftext_sentencepiece_tokenizer_bos_eos(self):
    """Check that the new tokenizer produces the same result as the tftext one
    with bos and eos."""
    tftext_sp = tensorflow_text.SentencepieceTokenizer(
        self.sentencepiece_model, add_bos=True, add_eos=True)
    opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
        self.sentencepiece_model, add_bos=True, add_eos=True)

    input_text = [
        u" ",
        u"to be or not to be",
        u"ignored by length text1",
        u"ignored by length text2",
    ]
    tftext_tokenized = tftext_sp.tokenize(input_text)
    opt_tokenized = opt_sp.tokenize(input_text)
    self.assertAllEqual(tftext_tokenized, opt_tokenized)

def decode(ids: tf.Tensor, vocab_filename: str, encoder_type: str):
    """DecodeOp."""
    if encoder_type not in ["sentencepiece", "sentencepiece_newline"]:
        raise ValueError("Unsupported encoder type: %s" % encoder_type)
    sp_model = tf.gfile.GFile(vocab_filename, "rb").read()
    tokenizer = tf_text.SentencepieceTokenizer(model=sp_model)
    ids = tf.where(ids > 1 + _SHIFT_RESERVED_TOKENS,
                   ids - _SHIFT_RESERVED_TOKENS, ids)
    ids = tf.cast(ids, tf.int32)
    text = tokenizer.detokenize(ids)
    text = tf.reshape(text, [-1])
    if encoder_type == "sentencepiece_newline":
        text = tf.strings.regex_replace(text, _NEWLINE_SYMBOL, "\n")
    return text

def test_tftext_sentencepiece_detokenizer(self):
    """Check that the new tokenizer produces the same result as the tftext one."""
    tftext_sp = tensorflow_text.SentencepieceTokenizer(self.sentencepiece_model)
    opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
        self.sentencepiece_model)

    input_text = [
        u" ",
        u"to be or not to be",
        u"ignored by length text1",
        u"ignored by length text2",
    ]
    tftext_tokenized = tftext_sp.tokenize(input_text)
    # Check the detokenizer.
    tftext_detokenized = tftext_sp.detokenize(tftext_tokenized)
    opt_detokenized = opt_sp.detokenize(tftext_tokenized)
    self.assertAllEqual(tftext_detokenized, opt_detokenized)

def __init__(self, model, nbest_size=0, alpha=1.0):
    """Initializes the tokenizer.

    Args:
      model: Path to the SentencePiece model.
      nbest_size: Number of candidates to sample from (disabled during
        inference).
      alpha: Smoothing parameter for the sampling.
    """
    super().__init__()
    self._nbest_size = nbest_size
    with tf.io.gfile.GFile(model, "rb") as model_file:
        self._tokenizer = tft.SentencepieceTokenizer(
            model=model_file.read(),
            out_type=tf.string,
            nbest_size=nbest_size,
            alpha=alpha,
        )

def __init__(self, vocab_file):
    sentence_model_voca = vocab_file
    sp_model = spm.SentencePieceProcessor()
    sp_proto = tf.io.gfile.GFile(sentence_model_voca, "rb").read()
    sp_model.LoadFromSerializedProto(sp_proto)
    self.vocab_size = sp_model.GetPieceSize()
    word_to_token = [sp_model.IdToPiece(i) for i in range(self.vocab_size)]
    self.word_start_subtoken = np.array(
        [sp_model.IdToPiece(i)[0] == "▁" for i in range(self.vocab_size)])
    self.tf_tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(sentence_model_voca, "rb").read())
    self.vocab = ko_bpe_vocab(word_to_token)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}

def _tokenize_example(example):
    text, label = example["text"], example["label"]
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
        text = tf.strings.regex_replace(text, "\n", substitute_newline)
    ids = tokenizer.tokenize(text)
    ids = ids[:max_encoder_length - 2]
    # Add [CLS] (65) and [SEP] (66) special tokens.
    prefix = tf.constant([65])
    suffix = tf.constant([66])
    ids = tf.concat([prefix, ids, suffix], axis=0)
    if isinstance(ids, tf.RaggedTensor):
        ids = ids.to_tensor(0)
    # tf.Example only supports tf.int64, but the TPU is better with tf.int32.
    label = tf.cast(label, tf.int32)
    return ids, label

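# Hypothetical wiring of the classification mapper above into a tf.data pipeline;
# dataset, batch_size, and max_encoder_length are assumed to exist in scope.
dataset = dataset.map(_tokenize_example,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.padded_batch(
    batch_size, padded_shapes=([max_encoder_length], []))
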
def main(args: argparse.Namespace):
    logger = get_logger("make-tfrecord")

    input_files = glob.glob(args.dataset_paths)
    logger.info(f"[+] Number of Dataset Files: {len(input_files)}")

    # Load Config
    logger.info(f"[+] Load Config From {args.data_config}")
    config = DataConfig.from_yaml(args.data_config)

    # Load Sentencepiece model
    logger.info(f"[+] Load Tokenizer From {args.sp_model_path}")
    with open(args.sp_model_path, "rb") as f:
        tokenizer = text.SentencepieceTokenizer(f.read(), add_bos=True, add_eos=True)

    serialize = tf.function(lambda audio, text: tf.io.serialize_tensor(
        tf.stack([tf.io.serialize_tensor(audio), tf.io.serialize_tensor(text)])))

    logger.info("[+] Start Saving Dataset...")
    for file_path in tqdm(input_files):
        output_dir = args.output_dir if args.output_dir else os.path.dirname(file_path)
        file_name = os.path.basename(file_path)
        output_path = os.path.join(output_dir, os.path.splitext(file_name)[0] + ".tfrecord")

        # Write TFRecordFile
        dataset = (
            get_dataset(file_path, config.file_format, config.sample_rate, tokenizer)
            .map(config.audio_feature_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .map(serialize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        )
        writer = tf.data.experimental.TFRecordWriter(output_path, "GZIP")
        writer.write(dataset)

    logger.info("[+] Done")

def encode(text: tf.Tensor, max_len: int, vocab_filename: str, encoder_type: str):
    """EncodeOp."""
    if encoder_type not in ["sentencepiece", "sentencepiece_newline"]:
        raise ValueError("Unsupported encoder type: %s" % encoder_type)
    sp_model = tf.gfile.GFile(vocab_filename, "rb").read()
    tokenizer = tf_text.SentencepieceTokenizer(model=sp_model)
    batch_size = text.shape[0]
    if encoder_type == "sentencepiece_newline":
        text = tf.strings.regex_replace(text, "\n", _NEWLINE_SYMBOL)
    ids = tokenizer.tokenize(text)
    eos = tf.ragged.constant([[1]] * batch_size)
    ids = tf.concat([ids, eos], axis=1)
    ids = ids.to_tensor(default_value=0)
    ids = ids[:, :max_len]
    pad = max_len - tf.shape(ids)[1]
    ids = tf.pad(ids, [[0, 0], [0, pad]])
    ids.set_shape([ids.shape[0], max_len])
    ids = tf.where(ids > 1, ids + _SHIFT_RESERVED_TOKENS, ids)
    ids = tf.cast(ids, tf.int64)
    return ids

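# A hedged round-trip example pairing the encode/decode ops above; the vocab path
# is a placeholder and, as written, both ops rely on the TF1-style tf.gfile API.
docs = tf.constant(["A document to summarize.\nWith a second line."])
ids = encode(docs, max_len=512,
             vocab_filename="/path/to/spm.model",
             encoder_type="sentencepiece_newline")
restored = decode(ids, "/path/to/spm.model", "sentencepiece_newline")
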
def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None):
    super().__init__(params, logging_dir, name=name)
    self._sentencepiece_model_path = params.sentencepiece_model_path
    if params.sentencepiece_model_path:
        self._sp_tokenizer = tftxt.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(),
            add_eos=True)
        try:
            empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
        except tf.errors.InternalError:
            raise ValueError(
                "EOS token not in tokenizer vocab. "
                "Please make sure the tokenizer generates a single token for an "
                "empty string.")
        self._eos_id = empty_str_tokenized.item()
        self._vocab_size = self._sp_tokenizer.vocab_size().numpy()
    else:
        raise ValueError("Sentencepiece model path not provided.")
    if (params.validation_data.input_path or
            params.validation_data.tfds_name) and self._logging_dir:
        self._references, self._tf_record_input_path = write_test_record(
            params.validation_data, self.logging_dir)

def benchmarkTokenizer(self):
    sp_model = _GetSentencepieceModel()
    test_text = [
        "This week we celebrate the casts and creatives who have come together"
        " to bring us our favorite.",
        "More Stacks products demonstrated commitment to excellent support.",
        "Test, test, test.",
    ]

    tftext_sp = tensorflow_text.SentencepieceTokenizer(sp_model)
    opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(sp_model)
    iter_number = 1000

    start = time.time()
    for _ in range(iter_number):
        _ = opt_sp.tokenize(test_text)
    self.report_benchmark(
        iters=iter_number, wall_time=time.time() - start, name="opt")

    start = time.time()
    for _ in range(iter_number):
        _ = tftext_sp.tokenize(test_text)
    self.report_benchmark(
        iters=iter_number, wall_time=time.time() - start, name="tf.text")

def __init__(self, bert_layer, max_len, min_len=1,
             CLS='[CLS]', SEP='[SEP]', PAD='[PAD]', UNK='[UNK]'):
    """Initializes the layer.

    :param CLS: Token that represents the start of a sentence
    :param SEP: Token that represents the end of a segment
    :param PAD: Token that represents padding
    :param UNK: Token that represents unknown tokens
    :param bert_layer: Keras layer loaded from a pretrained BERT
    """
    super().__init__()
    self._CLS = CLS
    self._SEP = SEP
    self._PAD = PAD
    self._min_len = min_len
    self._max_len = max_len

    resolved_object = bert_layer.resolved_object
    self.do_lower_case = resolved_object.do_lower_case.numpy()
    if hasattr(resolved_object, "tokenizer_type"):
        tokenizer_type_file = resolved_object.tokenizer_type.asset_path.numpy().decode("utf-8")
        with tf.io.gfile.GFile(tokenizer_type_file, 'r') as f_handler:
            self._tokenizer_type = f_handler.read().strip()
        tokenizer_file = resolved_object.tokenizer_file.asset_path.numpy().decode("utf-8")
        if self._tokenizer_type == SENTENCEPIECE:
            with tf.io.gfile.GFile(tokenizer_file, 'rb') as f_handler:
                sp_model = f_handler.read()
            self._tokenizer = tf_text.SentencepieceTokenizer(
                model=sp_model, out_type=tf.int32)
            self.vocab_table = create_tf_vocab_from_sp_tokenizer(
                self._tokenizer, num_oov_buckets=1)
        else:
            assert self._tokenizer_type == SPACE
            _, self.vocab_table = read_tf_vocab(tokenizer_file, UNK)
    else:
        vocab_file = resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
        _, self.vocab_table = create_tf_vocab_from_wp_tokenizer(
            vocab_file, num_oov_buckets=1)
        self._tokenizer = tf_text.BertTokenizer(
            self.vocab_table, token_out_type=tf.int64,
            lower_case=self.do_lower_case, unknown_token=UNK)
        self._tokenizer_type = WORDPIECE

    self._pad_id = self.vocab_table.lookup(tf.constant(PAD)) if PAD else -1
    self._cls_id = self.vocab_table.lookup(tf.constant(CLS)) if CLS else -1
    self._sep_id = self.vocab_table.lookup(tf.constant(SEP)) if SEP else -1
    if self._tokenizer_type == SENTENCEPIECE:
        self._pad_id = tf.cast(self._pad_id, tf.int32)
        self._cls_id = tf.cast(self._cls_id, tf.int32)
        self._sep_id = tf.cast(self._sep_id, tf.int32)

def _create_tokenizer(self):
    return text.SentencepieceTokenizer(
        model=self._model_serialized_proto,
        out_type=tf.int32,
        nbest_size=self._nbest_size,
        alpha=self._alpha)

def tf_tokenizer(self):
    """Instantiate and return a TF tokenizer."""
    return tf_text.SentencepieceTokenizer(model=self.sp_model)

def load_sentencepiece_model(model_proto):
    proto = tf.io.gfile.GFile(model_proto, 'rb').read()
    return tf_text.SentencepieceTokenizer(model=proto)

def __init__(self, vocab_file):
    super().__init__()
    serialized_proto = tf.compat.v1.gfile.GFile(vocab_file, "rb").read()
    self.tokenizer = tf_text.SentencepieceTokenizer(
        model=serialized_proto, add_bos=True, add_eos=True)

# fmt: on

if __name__ == "__main__":
    args = parser.parse_args()

    strategy = get_device_strategy(args.device)
    logger = get_logger()

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        logger.info("Use Mixed Precision FP16")

    # Construct Dataset
    with tf.io.gfile.GFile(args.sp_model_path, "rb") as f:
        tokenizer = text.SentencepieceTokenizer(f.read(), add_bos=True, add_eos=True)

    dataset_files = tf.io.gfile.glob(args.dataset_path)
    if not dataset_files:
        logger.error("[Error] Dataset path is invalid!")
        sys.exit(1)

    if args.auto_encoding:
        scatter = lambda tokens: (tokens, tokens)
        dataset = (
            tf.data.TextLineDataset(dataset_files,
                                    num_parallel_reads=tf.data.experimental.AUTOTUNE)
            .map(tokenizer.tokenize)
            .map(scatter)
        )
    else:
        tokenize = lambda inputs, outputs: (tokenizer.tokenize(inputs),
                                            tokenizer.tokenize(outputs))
        dataset = tf.data.experimental.CsvDataset(

def __call__(self, x):
    # Constrained sequence
    cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0, 10.0]])
    cs_input = np.array([cs_scores, cs_scores, cs_scores], dtype=np.float32)
    cs_transition_weights = np.array(
        [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
         [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
         [0.0, 1.0, 2.0, 3.0, 0.0]],
        dtype=np.float32)
    cs_allowed_transitions = np.array([[True, True, True, True, True],
                                       [True, True, True, True, True],
                                       [True, False, True, False, False],
                                       [True, True, True, True, True],
                                       [True, False, True, True, True]])
    constrained_sequence = text.viterbi_constrained_sequence(
        cs_input, [2, 2, 2],
        allowed_transitions=cs_allowed_transitions,
        transition_weights=cs_transition_weights,
        use_log_space=True,
        use_start_and_end_states=True)

    # Max Spanning Tree
    mst_num_nodes = tf.constant([4, 3], tf.int32)
    mst_scores = tf.constant(
        [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
         [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
        tf.int32)  # pyformat: disable
    (max_spanning_tree, _) = text.max_spanning_tree(mst_num_nodes, mst_scores)

    # Normalize
    normalized = text.case_fold_utf8(['A String'])
    normalized = text.normalize_utf8(normalized)

    # Regex split
    regex_split = text.regex_split(input=['Yo dawg!'],
                                   delim_regex_pattern=r'\s')

    # Rouge-L
    rl_hypotheses = tf.ragged.constant(
        [['captain', 'of', 'the', 'delta', 'flight'],
         ['the', '1990', 'transcript']])
    rl_references = tf.ragged.constant(
        [['delta', 'air', 'lines', 'flight'],
         ['this', 'concludes', 'the', 'transcript']])
    (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)

    # Sentence breaking version 1 (token dependent)
    sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                     ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
    sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
    sb_token_starts = []
    sb_token_ends = []
    for sentence in sb_token_word:
        sentence_string = ''
        sentence_start = []
        sentence_end = []
        for word in sentence:
            sentence_start.append(len(sentence_string))
            sentence_string = sentence_string.join([word, ' '])
            sentence_end.append(len(sentence_string))
        sb_token_starts.append(sentence_start)
        sb_token_ends.append(sentence_end)
    sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
    sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
    sb_token_properties = tf.ragged.constant(sb_token_properties, dtype=tf.int64)
    (sentence_breaking, _, _, _) = text.sentence_fragments(
        sb_token_word, sb_token_starts, sb_token_ends, sb_token_properties)

    # Sentence breaking version 2 (StateBasedSentenceBreaker)
    sbv2_text_input = [['Welcome to the U.S.! Harry'],
                       ['Wu Tang Clan; ain\'t nothing']]
    sentence_breaker_v2 = text.StateBasedSentenceBreaker()
    sbv2_fragment_text, _, _ = (
        sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))

    # Sentencepiece tokenizer
    sp_model_file = (
        'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model')
    sp_model = open(sp_model_file, 'rb').read()
    sp_tokenizer = text.SentencepieceTokenizer(sp_model)
    sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
    sentencepiece = sp_tokenizer.detokenize(sentencepiece)
    (sentencepiece, _, _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
    sentencepiece_size = sp_tokenizer.vocab_size()
    sentencepiece_id = sp_tokenizer.id_to_string(1)

    # Split merge tokenizer
    sm_tokenizer = text.SplitMergeTokenizer()
    split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                        [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])

    # Split merge from logits tokenizer
    smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
    split_merge_from_logits = smfl_tokenizer.tokenize(
        b'IloveFlume!',
        # One pair of logits for each Unicode character from the text. Each
        # pair indicates a "split" action if the first component is greater than
        # the second one, and a "merge" otherwise.
        [
            [2.7, -0.3],  # I: split
            [4.1, 0.82],  # l: split
            [-2.3, 4.3],  # o: merge
            [3.1, 12.2],  # v: merge
            [-3.0, 4.7],  # e: merge
            [2.7, -0.7],  # F: split
            [0.7, 15.0],  # l: merge
            [1.6, 23.0],  # u: merge
            [2.1, 11.0],  # m: merge
            [0.0, 20.0],  # e: merge
            [18.0, 0.7],  # !: split
        ])

    # Confirm the TF unicode_script op that requires ICU works
    tf_unicode_script = tf.strings.unicode_script(
        [ord('a'), 0x0411, 0x82b8, ord(',')])

    # Unicode script tokenizer
    us_tokenizer = text.UnicodeScriptTokenizer()
    unicode_script = us_tokenizer.tokenize(['a string'])

    # Whitespace tokenizer
    ws_tokenizer = text.WhitespaceTokenizer()
    whitespace = ws_tokenizer.tokenize(['a string'])

    # Wordpiece tokenizer
    wp_initializer = tf.lookup.KeyValueTensorInitializer(
        ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
    self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                    default_value=-1)
    wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
    wordpiece = wp_tokenizer.tokenize(['i am'])

    # Wordshape
    wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                text.WordShape.HAS_PUNCTUATION_DASH)

    # Assertion method
    def assert_check(tensor):
        return tf.assert_equal(tensor, tf.identity(tensor))

    # Assertions
    constrained_sequence_assert = assert_check(constrained_sequence.to_tensor())
    max_spanning_tree_assert = assert_check(max_spanning_tree)
    normalized_assert = assert_check(normalized)
    regex_split_assert = assert_check(regex_split.to_tensor())
    rouge_l_assert = assert_check(rouge_l)
    sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
    sentence_breaking_v2_assert = assert_check(sbv2_fragment_text.to_tensor())
    sentencepiece_assert = assert_check(sentencepiece.to_tensor())
    sentencepiece_id_assert = assert_check(sentencepiece_id)
    sentencepiece_size_assert = assert_check(sentencepiece_size)
    split_merge_assert = assert_check(split_merge)
    split_merge_from_logits_assert = assert_check(split_merge_from_logits)
    tf_unicode_script_assert = assert_check(tf_unicode_script)
    unicode_script_assert = assert_check(unicode_script.to_tensor())
    whitespace_assert = assert_check(whitespace.to_tensor())
    wordpiece_assert = assert_check(wordpiece.to_tensor())
    wordshapes_assert = assert_check(wordshapes)

    with tf.control_dependencies([
        constrained_sequence_assert, max_spanning_tree_assert, normalized_assert,
        regex_split_assert, rouge_l_assert, sentence_breaking_assert,
        sentence_breaking_v2_assert, sentencepiece_assert,
        sentencepiece_id_assert, sentencepiece_size_assert, split_merge_assert,
        split_merge_from_logits_assert, tf_unicode_script_assert,
        unicode_script_assert, whitespace_assert, wordpiece_assert,
        wordshapes_assert
    ]):
        y = tf.add(x, [1])
        return {'y': y}