Example #1
  def testSerialization(self):
    with context.eager_mode():
      sentencepiece_model_file = (
          'tensorflow_text/python/ops/test_data/'
          'test_oss_model.model')
      model = gfile.GFile(sentencepiece_model_file, 'rb').read()
      sp = SentencepieceTokenizer(model)
      strings = ['hello', 'world']
      dataset = dataset_ops.Dataset.from_tensor_slices(strings)
      # Ensure we can map the tokenizer across the dataset.
      dataset = dataset.map(sp.tokenize)
      graph = dataset._as_serialized_graph()
      element_spec = dataset.element_spec
      dataset_graph_string = graph.numpy()
      expected = sp.tokenize(strings)

    # Reset the eager context to make sure that the serialized dataset graph
    # is self-contained.
    context._reset_context()

    with context.eager_mode():
      restored = dataset_ops.from_variant(
          gen_experimental_dataset_ops.dataset_from_graph(dataset_graph_string),
          element_spec)
      for i, result in enumerate(restored):
        self.assertAllEqual(result, expected[i])
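Note: this test exercises private serialization hooks (`dataset._as_serialized_graph`, `gen_experimental_dataset_ops.dataset_from_graph`). For a rough idea of the same round trip through public APIs only, here is a hedged sketch using `tf.data.Dataset.save`/`load` (available in recent TF releases; `tf.data.experimental.save`/`load` in older 2.x). The model path 'sp.model' is hypothetical, and `tensorflow_text` is assumed to be installed.

import tensorflow as tf
import tensorflow_text as tf_text

# Hypothetical local path to a serialized SentencePiece model proto.
model = tf.io.gfile.GFile('sp.model', 'rb').read()
sp = tf_text.SentencepieceTokenizer(model)

dataset = tf.data.Dataset.from_tensor_slices(['hello', 'world']).map(sp.tokenize)
dataset.save('/tmp/tokenized_dataset')        # snapshot the mapped pipeline
restored = tf.data.Dataset.load('/tmp/tokenized_dataset')
for tokens in restored:
  print(tokens)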
Example #2
  def testTokenizeAndDetokenizeWithOffsetsMatrix(self):
    sp = SentencepieceTokenizer(self.model, out_type=dtypes.string)
    sentences = [['I love carpet.', 'I love desk.', 'I love lamp.'],
                 ['Never tell me the odds']]
    expected_tokens = [[['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't', '.'],
                        ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                        ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']],
                       [['▁', 'N', 'ever', '▁tell', '▁me', '▁the', '▁', 'o',
                         'd', 'd', 's']]]
    expected_tokens = _utf8(expected_tokens)
    expected_starts = [[[0, 1, 3, 4, 6, 8, 10, 12, 13],
                        [0, 1, 3, 4, 6, 11],
                        [0, 1, 3, 4, 6, 8, 11]],
                       [[0, 0, 1, 5, 10, 13, 17, 18, 19, 20, 21]]]
    expected_limits = [[[1, 3, 4, 6, 8, 10, 12, 13, 14],
                        [1, 3, 4, 6, 11, 12],
                        [1, 3, 4, 6, 8, 11, 12]],
                       [[0, 1, 5, 10, 13, 17, 18, 19, 20, 21, 22]]]
    (tokens, starts, limits) = sp.tokenize_with_offsets(
        ragged_factory_ops.constant(sentences))
    self.assertAllEqual(expected_tokens, tokens)
    self.assertAllEqual(expected_starts, starts)
    self.assertAllEqual(expected_limits, limits)
    detokenized = sp.detokenize(tokens)
    self.assertAllEqual(_utf8(sentences), detokenized)
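The `_utf8` helper used throughout these examples is not shown in the excerpt; a plausible sketch (hypothetical reconstruction) simply encodes nested Python strings to UTF-8 bytes so they compare equal to the byte strings returned by the tokenizer:

def _utf8(tokens):
  # Recursively encode str leaves of a nested list as UTF-8 bytes.
  if isinstance(tokens, list):
    return [_utf8(t) for t in tokens]
  return tokens.encode('utf-8')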
Example #3
  def testSampleTokenizeAndDetokenize(self, nbest_size, alpha, out_type):
    sp = SentencepieceTokenizer(
        self.model, nbest_size=nbest_size, alpha=alpha, out_type=out_type)
    sentences = [['I love carpet', 'I love desk.', 'I love lamp.'],
                 ['Never tell me the odds']]
    result = sp.tokenize(ragged_factory_ops.constant(sentences))
    detokenized = sp.detokenize(result)
    self.assertAllEqual(_utf8(sentences), detokenized)
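For context, `nbest_size` and `alpha` enable SentencePiece's subword-regularization sampling: with `nbest_size > 1` (or -1 to sample over the full lattice) the segmentation is drawn at random on each call, which is why this test only checks the detokenized round trip rather than exact tokens. A minimal sketch of that behavior through the public API, assuming `model` holds the same serialized SentencePiece model:

import tensorflow as tf
import tensorflow_text as tf_text

sp = tf_text.SentencepieceTokenizer(model, nbest_size=-1, alpha=0.1,
                                    out_type=tf.string)
for _ in range(3):
  # The segmentation may differ between calls, but detokenize() always
  # reproduces the original text.
  print(sp.tokenize('I love carpet'))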
Example #4
  def testIdToStringVector(self):
    sp = SentencepieceTokenizer(self.model)
    pieces = _utf8([['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't'],
                    ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                    ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']])
    ids = [[9, 169, 21, 125, 78, 48, 132, 15], [9, 169, 21, 125, 727, 6],
           [9, 169, 21, 125, 169, 579, 6]]
    result = sp.id_to_string(ragged_factory_ops.constant(ids))
    self.assertAllEqual(pieces, result)
Example #5
  def testReturnNbestAndDetokenize(self):
    sp = SentencepieceTokenizer(
        self.model, nbest_size=2, out_type=dtypes.int32, return_nbest=True)
    sentences = ['I love carpet', 'Never tell me the odds']
    result = sp.tokenize(ragged_factory_ops.constant(sentences))
    detokenized = sp.detokenize(result)
    self.assertAllEqual(
        _utf8(sentences), ragged_gather_ops.gather(detokenized, [0, 2]))
    self.assertAllEqual(
        _utf8(sentences), ragged_gather_ops.gather(detokenized, [1, 3]))
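The gather indices above imply that with `return_nbest=True` the candidate tokenizations are stacked along the first dimension in sentence-major order, so candidate j of sentence i sits at row i * nbest_size + j. A small regrouping sketch built on that observation (assuming `result` and `sentences` from the test above, in eager mode):

nbest_size = 2
per_sentence = [[result[i * nbest_size + j] for j in range(nbest_size)]
                for i in range(len(sentences))]
# per_sentence[i] now holds the nbest_size candidate tokenizations of sentences[i].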
Example #6
  def testTokenizeAndDetokenizeWithOffsetsSingleElementVector(self):
    sp = SentencepieceTokenizer(self.model, out_type=dtypes.string)
    sentences = ['I love lamp.']
    expected_tokens = [['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']]
    expected_tokens = _utf8(expected_tokens)
    expected_starts = [[0, 1, 3, 4, 6, 8, 11]]
    expected_limits = [[1, 3, 4, 6, 8, 11, 12]]
    (tokens, starts, limits) = sp.tokenize_with_offsets(
        ragged_factory_ops.constant(sentences))
    self.assertRaggedEqual(expected_tokens, tokens)
    self.assertRaggedEqual(expected_starts, starts)
    self.assertRaggedEqual(expected_limits, limits)
Example #7
  def testStringToIdRagged(self):
    sp = SentencepieceTokenizer(self.model)
    pieces = _utf8(
        [[['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't'],
          ['▁I', '▁l', 'o', 've', '▁desk', '.'],
          ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']],
         [['▁', 'N', 'ever', '▁tell', '▁me', '▁the', '▁', 'o', 'd', 'd', 's']]])
    ids = [[[9, 169, 21, 125, 78, 48, 132, 15], [9, 169, 21, 125, 727, 6],
            [9, 169, 21, 125, 169, 579, 6]],
           [[4, 199, 363, 310, 33, 7, 4, 21, 17, 17, 8]]]
    result = sp.string_to_id(ragged_factory_ops.constant(pieces, dtypes.string))
    self.assertAllEqual(ids, result)
Example #8
  def testTokenizeAndDetokenizeWithOffsetsVector(self):
    sp = SentencepieceTokenizer(self.model, out_type=dtypes.string)
    sentences = ['I love carpet.', 'I love desk.', 'I love lamp.']
    expected_tokens = [['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't', '.'],
                       ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                       ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']]
    expected_tokens = _utf8(expected_tokens)
    expected_starts = [[0, 1, 3, 4, 6, 8, 10, 12, 13],
                       [0, 1, 3, 4, 6, 11],
                       [0, 1, 3, 4, 6, 8, 11]]
    expected_limits = [[1, 3, 4, 6, 8, 10, 12, 13, 14],
                       [1, 3, 4, 6, 11, 12],
                       [1, 3, 4, 6, 8, 11, 12]]
    (tokens, starts, limits) = sp.tokenize_with_offsets(
        ragged_factory_ops.constant(sentences))
    self.assertRaggedEqual(expected_tokens, tokens)
    self.assertRaggedEqual(expected_starts, starts)
    self.assertRaggedEqual(expected_limits, limits)
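The start/limit values in these offset tests are byte positions into the UTF-8 input, so they can be used to slice the original strings directly. A minimal sketch, again assuming `model` holds the same serialized SentencePiece model and eager execution:

import tensorflow as tf
import tensorflow_text as tf_text

sp = tf_text.SentencepieceTokenizer(model, out_type=tf.string)
sentence = 'I love lamp.'
tokens, starts, limits = sp.tokenize_with_offsets(tf.constant(sentence))
for start, limit in zip(starts.numpy(), limits.numpy()):
  # Each (start, limit) pair delimits the token's bytes in the input string.
  print(sentence.encode('utf-8')[start:limit])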
Example #9
  def getTokenizerAndSetOptions(self, reverse, add_bos, add_eos, out_type):
    return SentencepieceTokenizer(
        self.model,
        reverse=reverse,
        add_bos=add_bos,
        add_eos=add_eos,
        out_type=out_type)
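This helper is presumably invoked from parameterized tests elsewhere in the suite; a hedged usage sketch (inside a test method of the same class, with `dtypes` imported as in the other examples):

sp_plain = self.getTokenizerAndSetOptions(
    reverse=False, add_bos=False, add_eos=False, out_type=dtypes.int32)
sp_marked = self.getTokenizerAndSetOptions(
    reverse=True, add_bos=True, add_eos=True, out_type=dtypes.int32)
# add_bos/add_eos wrap each tokenization with the <s>/</s> ids, while
# reverse=True returns the pieces in reverse order.
plain = sp_plain.tokenize('I love lamp.')
marked = sp_marked.tokenize('I love lamp.')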
Example #10
  def testBasicPipeline(self):
    if not context.executing_eagerly():
      self.skipTest('testBasicPipeline only supported in eager mode.')

    sp = SentencepieceTokenizer(self.model)

    strings = ['hello', 'world']
    dataset = dataset_ops.Dataset.from_tensor_slices(strings)
    # Ensure we can map the tokenizer across the dataset.
    dataset1 = dataset.map(sp.tokenize)
    # Ensure there's no error with a second map call.
    dataset2 = dataset.map(sp.tokenize)

    expected = sp.tokenize(strings)
    for i, result in enumerate(dataset1):
      self.assertAllEqual(result, expected[i])
    for i, result in enumerate(dataset2):
      self.assertAllEqual(result, expected[i])
Example #11
  def testSavedModel(self):
    sp = SentencepieceTokenizer(self.model)
    test_module = TestSavedModelModule(sp)
    inputs = constant_op.constant(['hello world'])
    expected_result = test_module.tokenize(inputs)
    temp_dir = tempfile.mkdtemp(dir=test.get_temp_dir())
    save.save(test_module, temp_dir)
    restored_model = load.load(temp_dir)
    self.assertAllEqual(restored_model.tokenize(inputs), expected_result)
    file_io.delete_recursively(temp_dir)
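`TestSavedModelModule` is not defined in this excerpt; a hypothetical reconstruction that wraps the tokenizer in a `tf.Module` with a fixed `tf.function` input signature, so it can be exported and restored, could look like:

import tensorflow as tf

class TestSavedModelModule(tf.Module):

  def __init__(self, tokenizer):
    super().__init__()
    self.tokenizer = tokenizer

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[None], dtype=tf.string, name='inputs')])
  def tokenize(self, inputs):
    return self.tokenizer.tokenize(inputs)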
Example #12
  def testGetVocabSize(self):
    sp = SentencepieceTokenizer(self.model)
    self.assertAllEqual(1000, sp.vocab_size())
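`vocab_size()` combined with `id_to_string()` can also enumerate the model's pieces. A small sketch, assuming the same `sp` tokenizer and eager mode:

import tensorflow as tf

pieces = sp.id_to_string(tf.range(sp.vocab_size()))  # all pieces as strings
print(pieces[:10])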
Example #13
  def testInvalidModel(self):
    with self.cached_session():
      with self.assertRaises(errors.InternalError):
        sp = SentencepieceTokenizer('invalid model')
        result = sp.tokenize('whatever')
        result.eval()
Example #14
  def testEmptyModel(self):
    with self.cached_session():
      with self.assertRaises(errors.InvalidArgumentError):
        sp = SentencepieceTokenizer()
        result = sp.tokenize('whatever')
        result.eval()
Example #15
  def testIdToStringScalar(self):
    sp = SentencepieceTokenizer(self.model)
    result = sp.id_to_string(125)
    self.assertAllEqual('ve', result)
  def testStringToIdScalar(self):
    sp = SentencepieceTokenizer(self.model)
    result = sp.string_to_id('</s>')
    self.assertAllEqual(2, result)