def test_encode_length(self):
    """Check subword encoding when has_length_token=True is passed.

    Each input starts with a number (99 / 97); the expected rows start
    with that same number, followed by the subword ids and trailing
    zeros, ten ids per row.
    """
    inputs = ["99 the quick brown fox.", "97 the quick brown"]
    expected_ids = [
        [99, 8, 9, 10, 11, 12, 38, 1, 0, 0],
        [97, 8, 9, 10, 11, 1, 0, 0, 0, 0],
    ]
    encoded = parsing_ops.encode(
        inputs, 10, _SUBWORDS, "subword", has_length_token=True
    )
    self.assertAllEqual(expected_ids, encoded)
def test_spm_prefix(self):
    """Check the leading number survives encoding and is absent after decoding.

    Encodes with the "sentencepiece_newline" encoder and
    has_length_token=True, asserts that the first id of each row equals
    the number the input started with, then asserts that decoding yields
    the input text without that number.
    """
    inputs = ["25 the quick brown fox.", "23 the quick brown"]
    expected_text = ["the quick brown fox.", "the quick brown"]

    ids = parsing_ops.encode(
        inputs, 10, _SPM, "sentencepiece_newline", has_length_token=True
    )
    # First id of each row must equal the numeric prefix of that input.
    for row_index, expected_prefix in enumerate((25, 23)):
        self.assertAllEqual(expected_prefix, ids[row_index][0])

    round_tripped = parsing_ops.decode(ids, _SPM, "sentencepiece_newline")
    self.assertAllEqual(expected_text, round_tripped)
def test_encode(self):
    """Check plain subword encoding of two short strings.

    The expected rows hold the subword ids followed by trailing zeros,
    ten ids per row.
    """
    inputs = ["the quick brown fox.", "the quick brown"]
    expected_ids = [
        [8, 9, 10, 11, 12, 38, 1, 0, 0, 0],
        [8, 9, 10, 11, 1, 0, 0, 0, 0, 0],
    ]
    encoded = parsing_ops.encode(inputs, 10, _SUBWORDS, "subword")
    self.assertAllEqual(expected_ids, encoded)
def test_tf_decode(self, encoder_type):
    """Check that both decode implementations agree on the same ids.

    Encodes a string tensor with `parsing_ops.encode`, then asserts that
    `parsing_ops.decode` and `public_parsing_ops.decode` return equal
    results for those ids.

    Args:
        encoder_type: Encoder name forwarded to the encode call and to
            both decode calls. NOTE(review): presumably supplied by a
            parameterized-test decorator outside this view -- confirm.
    """
    text = tf.constant(["the quick brown fox.", "the quick brown\n"])
    ids = parsing_ops.encode(text, 10, _SPM_VOCAB, encoder_type)

    internal_decoded = parsing_ops.decode(ids, _SPM_VOCAB, encoder_type)
    public_decoded = public_parsing_ops.decode(ids, _SPM_VOCAB, encoder_type)
    self.assertAllEqual(internal_decoded, public_decoded)