def testSerialization(self):
  with context.eager_mode():
    sentencepiece_model_file = (
        'tensorflow_text/python/ops/test_data/'
        'test_oss_model.model')
    model = gfile.GFile(sentencepiece_model_file, 'rb').read()
    sp = SentencepieceTokenizer(model)
    strings = ['hello', 'world']
    dataset = dataset_ops.Dataset.from_tensor_slices(strings)
    # Ensure we can map the tokenizer across the dataset.
    dataset = dataset.map(sp.tokenize)
    graph = dataset._as_serialized_graph()
    element_spec = dataset.element_spec
    dataset_graph_string = graph.numpy()
    expected = sp.tokenize(strings)
  # Reset the eager context to make sure that the serialized dataset graph
  # is self-contained.
  context._reset_context()
  with context.eager_mode():
    restored = dataset_ops.from_variant(
        gen_experimental_dataset_ops.dataset_from_graph(dataset_graph_string),
        element_spec)
    for i, result in enumerate(restored):
      self.assertAllEqual(result, expected[i])
def testTokenizeAndDetokenizeWithOffsetsMatrix(self):
  sp = SentencepieceTokenizer(self.model, out_type=dtypes.string)
  sentences = [['I love carpet.', 'I love desk.', 'I love lamp.'],
               ['Never tell me the odds']]
  expected_tokens = [[['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't', '.'],
                      ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                      ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']],
                     [['▁', 'N', 'ever', '▁tell', '▁me', '▁the', '▁', 'o',
                       'd', 'd', 's']]]
  expected_tokens = _utf8(expected_tokens)
  expected_starts = [[[0, 1, 3, 4, 6, 8, 10, 12, 13],
                      [0, 1, 3, 4, 6, 11],
                      [0, 1, 3, 4, 6, 8, 11]],
                     [[0, 0, 1, 5, 10, 13, 17, 18, 19, 20, 21]]]
  expected_limits = [[[1, 3, 4, 6, 8, 10, 12, 13, 14],
                      [1, 3, 4, 6, 11, 12],
                      [1, 3, 4, 6, 8, 11, 12]],
                     [[0, 1, 5, 10, 13, 17, 18, 19, 20, 21, 22]]]
  (tokens, starts, limits) = sp.tokenize_with_offsets(
      ragged_factory_ops.constant(sentences))
  self.assertAllEqual(expected_tokens, tokens)
  self.assertAllEqual(expected_starts, starts)
  self.assertAllEqual(expected_limits, limits)
  detokenized = sp.detokenize(tokens)
  self.assertAllEqual(_utf8(sentences), detokenized)
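# NOTE: the following test is parameterized over (nbest_size, alpha,
# out_type); the @parameterized.parameters decorator that supplies the
# value grid is assumed to accompany it in the full test file.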
def testSampleTokenizeAndDetokenize(self, nbest_size, alpha, out_type):
  sp = SentencepieceTokenizer(
      self.model, nbest_size=nbest_size, alpha=alpha, out_type=out_type)
  sentences = [['I love carpet', 'I love desk.', 'I love lamp.'],
               ['Never tell me the odds']]
  result = sp.tokenize(ragged_factory_ops.constant(sentences))
  detokenized = sp.detokenize(result)
  self.assertAllEqual(_utf8(sentences), detokenized)
def testIdToStringVector(self):
  sp = SentencepieceTokenizer(self.model)
  pieces = _utf8([['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't'],
                  ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                  ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']])
  ids = [[9, 169, 21, 125, 78, 48, 132, 15],
         [9, 169, 21, 125, 727, 6],
         [9, 169, 21, 125, 169, 579, 6]]
  result = sp.id_to_string(ragged_factory_ops.constant(ids))
  self.assertAllEqual(pieces, result)
def testReturnNbestAndDetokenize(self):
  sp = SentencepieceTokenizer(
      self.model, nbest_size=2, out_type=dtypes.int32, return_nbest=True)
  sentences = ['I love carpet', 'Never tell me the odds']
  result = sp.tokenize(ragged_factory_ops.constant(sentences))
  detokenized = sp.detokenize(result)
  # With return_nbest=True the tokenizer emits nbest_size candidate
  # tokenizations per input sentence, flattened into consecutive rows, so
  # rows [0, 2] and [1, 3] each hold one complete set of detokenized
  # sentences.
  self.assertAllEqual(
      _utf8(sentences), ragged_gather_ops.gather(detokenized, [0, 2]))
  self.assertAllEqual(
      _utf8(sentences), ragged_gather_ops.gather(detokenized, [1, 3]))
def testTokenizeAndDetokenizeWithOffsetsSingleElementVector(self):
  sp = SentencepieceTokenizer(self.model, out_type=dtypes.string)
  sentences = ['I love lamp.']
  expected_tokens = [['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']]
  expected_tokens = _utf8(expected_tokens)
  expected_starts = [[0, 1, 3, 4, 6, 8, 11]]
  expected_limits = [[1, 3, 4, 6, 8, 11, 12]]
  (tokens, starts, limits) = sp.tokenize_with_offsets(
      ragged_factory_ops.constant(sentences))
  self.assertAllEqual(expected_tokens, tokens)
  self.assertAllEqual(expected_starts, starts)
  self.assertAllEqual(expected_limits, limits)
def testStringToIdRagged(self):
  sp = SentencepieceTokenizer(self.model)
  pieces = _utf8(
      [[['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't'],
        ['▁I', '▁l', 'o', 've', '▁desk', '.'],
        ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']],
       [['▁', 'N', 'ever', '▁tell', '▁me', '▁the', '▁', 'o', 'd', 'd', 's']]])
  ids = [[[9, 169, 21, 125, 78, 48, 132, 15],
          [9, 169, 21, 125, 727, 6],
          [9, 169, 21, 125, 169, 579, 6]],
         [[4, 199, 363, 310, 33, 7, 4, 21, 17, 17, 8]]]
  result = sp.string_to_id(ragged_factory_ops.constant(pieces, dtypes.string))
  self.assertAllEqual(ids, result)
def testTokenizeAndDetokenizeWithOffsetsVector(self):
  sp = SentencepieceTokenizer(self.model, out_type=dtypes.string)
  sentences = ['I love carpet.', 'I love desk.', 'I love lamp.']
  expected_tokens = [['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't', '.'],
                     ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                     ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']]
  expected_tokens = _utf8(expected_tokens)
  expected_starts = [[0, 1, 3, 4, 6, 8, 10, 12, 13],
                     [0, 1, 3, 4, 6, 11],
                     [0, 1, 3, 4, 6, 8, 11]]
  expected_limits = [[1, 3, 4, 6, 8, 10, 12, 13, 14],
                     [1, 3, 4, 6, 11, 12],
                     [1, 3, 4, 6, 8, 11, 12]]
  (tokens, starts, limits) = sp.tokenize_with_offsets(
      ragged_factory_ops.constant(sentences))
  self.assertAllEqual(expected_tokens, tokens)
  self.assertAllEqual(expected_starts, starts)
  self.assertAllEqual(expected_limits, limits)
def getTokenizerAndSetOptions(self, reverse, add_bos, add_eos, out_type):
  return SentencepieceTokenizer(
      self.model,
      reverse=reverse,
      add_bos=add_bos,
      add_eos=add_eos,
      out_type=out_type)
def testBasicPipeline(self):
  if not context.executing_eagerly():
    self.skipTest('testBasicPipeline only supported in eager mode.')

  sp = SentencepieceTokenizer(self.model)

  strings = ['hello', 'world']
  dataset = dataset_ops.Dataset.from_tensor_slices(strings)
  # Ensure we can map the tokenizer across the dataset.
  dataset1 = dataset.map(sp.tokenize)
  # Ensure there's no error with a second map call.
  dataset2 = dataset.map(sp.tokenize)

  expected = sp.tokenize(strings)

  for i, result in enumerate(dataset1):
    self.assertAllEqual(result, expected[i])
  for i, result in enumerate(dataset2):
    self.assertAllEqual(result, expected[i])
def testSavedModel(self):
  sp = SentencepieceTokenizer(self.model)
  test_module = TestSavedModelModule(sp)
  inputs = constant_op.constant(['hello world'])
  expected_result = test_module.tokenize(inputs)
  temp_dir = tempfile.mkdtemp(dir=test.get_temp_dir())
  save.save(test_module, temp_dir)
  restored_model = load.load(temp_dir)
  self.assertAllEqual(restored_model.tokenize(inputs), expected_result)
  file_io.delete_recursively(temp_dir)
def testGetVocabSize(self):
  sp = SentencepieceTokenizer(self.model)
  self.assertAllEqual(1000, sp.vocab_size())
def testInvalidModel(self):
  with self.cached_session():
    with self.assertRaises(errors.InternalError):
      sp = SentencepieceTokenizer('invalid model')
      result = sp.tokenize('whatever')
      result.eval()
def testEmptyModel(self):
  with self.cached_session():
    with self.assertRaises(errors.InvalidArgumentError):
      sp = SentencepieceTokenizer()
      result = sp.tokenize('whatever')
      result.eval()
def testIdToStringScalar(self):
  sp = SentencepieceTokenizer(self.model)
  result = sp.id_to_string(125)
  self.assertAllEqual('ve', result)
def testStringToIdScalar(self):
  sp = SentencepieceTokenizer(self.model)
  result = sp.string_to_id('</s>')
  self.assertAllEqual(2, result)
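# --- Illustrative usage (not part of the test suite) ---
# A minimal sketch of the round-trip behavior exercised by the tests above,
# assuming `model` holds a serialized SentencePiece model proto (as read from
# the test_oss_model.model file used throughout this file):
#
#   import tensorflow as tf
#   import tensorflow_text as tf_text
#
#   sp = tf_text.SentencepieceTokenizer(model, out_type=tf.string)
#   tokens = sp.tokenize(['I love lamp.'])  # RaggedTensor of subword pieces
#   text = sp.detokenize(tokens)            # recovers the original strings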