  def testStringToIdVector(self):
    # Map a batch of pre-tokenized sentence pieces to their vocabulary ids.
    sp = SentencepieceTokenizer(self.model)
    pieces = _utf8([['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't'],
                    ['▁I', '▁l', 'o', 've', '▁desk', '.'],
                    ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']])
    ids = [[9, 169, 21, 125, 78, 48, 132, 15], [9, 169, 21, 125, 727, 6],
           [9, 169, 21, 125, 169, 579, 6]]
    result = sp.string_to_id(ragged_factory_ops.constant(pieces))
    self.assertAllEqual(ids, result)
  def testStringToIdRagged(self):
    # The same mapping works on ragged inputs with an extra batch dimension.
    sp = SentencepieceTokenizer(self.model)
   pieces = _utf8(
       [[['▁I', '▁l', 'o', 've', '▁c', 'ar', 'pe', 't'],
         ['▁I', '▁l', 'o', 've', '▁desk', '.'],
         ['▁I', '▁l', 'o', 've', '▁l', 'amp', '.']],
        [['▁', 'N', 'ever', '▁tell', '▁me', '▁the', '▁', 'o', 'd', 'd', 's']]])
   ids = [[[9, 169, 21, 125, 78, 48, 132, 15], [9, 169, 21, 125, 727, 6],
           [9, 169, 21, 125, 169, 579, 6]],
          [[4, 199, 363, 310, 33, 7, 4, 21, 17, 17, 8]]]
   result = sp.string_to_id(ragged_factory_ops.constant(pieces, dtypes.string))
   self.assertAllEqual(ids, result)
  def testStringToIdScalar(self):
    # A single piece maps to a scalar id; '</s>' is id 2 in the test model.
    sp = SentencepieceTokenizer(self.model)
    result = sp.string_to_id('</s>')
    self.assertAllEqual(2, result)
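# The three tests above assume a fixture that provides `self.model` (a
# serialized SentencePiece model proto) and the `_utf8` helper. Below is a
# minimal sketch of that scaffolding, not the surrounding file's actual code:
# the imports use existing TensorFlow / tensorflow_text module paths, but the
# model path and the test-class name are hypothetical.
import tensorflow as tf

from tensorflow.python.framework import dtypes
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer


def _utf8(tokens):
  """Recursively encode nested lists of Python strings as UTF-8 bytes."""
  if isinstance(tokens, list):
    return [_utf8(t) for t in tokens]
  return tokens.encode('utf-8')


class SentencepieceTokenizerStringToIdTest(tf.test.TestCase):  # hypothetical name

  def setUp(self):
    super().setUp()
    # Hypothetical path: any serialized SentencePiece model file works here.
    with open('test_data/test_oss_model.model', 'rb') as f:
      self.model = f.read()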