import itertools as it

import sentencepiece as spm
import tensorflow as tf  # TF1-style graph/session API (tf.compat.v1 under TF2).

import tf_sentencepiece as tfspm


# The test methods below belong to a tf.test.TestCase subclass; the
# _getSentencePieceModelFile() and _getExpected() helpers are defined on
# that class.
def testSampleEncodeAndDecode(self):
  sentencepiece_model_file = self._getSentencePieceModelFile()
  processor = spm.SentencePieceProcessor()
  processor.Load(sentencepiece_model_file)
  sentences, _, _, _ = self._getExpected(processor)

  with tf.Session():
    for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]:
      # Round trip test.
      nbest_size = tf.constant(n)
      alpha = tf.constant(a)
      s = tf.constant(sentences)

      pieces, seq_len1 = tfspm.encode(
          s, nbest_size=nbest_size, alpha=alpha,
          model_file=sentencepiece_model_file, out_type=tf.string)
      ids, seq_len2 = tfspm.encode(
          s, nbest_size=nbest_size, alpha=alpha,
          model_file=sentencepiece_model_file)
      decoded_sentences1 = tfspm.decode(
          pieces, seq_len1, model_file=sentencepiece_model_file)
      decoded_sentences2 = tfspm.decode(
          ids, seq_len2, model_file=sentencepiece_model_file)

      self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
      self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
def testInvalidInput(self):
  sentences = ['Hello world.', 'This is a test.']
  ids = [[0, 1], [2, 3]]
  model_file = self._getSentencePieceModelFile()
  with tf.Session() as sess:
    a = tf.constant(sentences)
    b = tf.constant(ids)

    # Valid inputs: per-sentence or scalar nbest_size/alpha, and a
    # sequence_length that matches the batch size.
    alpha = tf.constant([1.0, 2.0])
    sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))

    nbest_size = tf.constant([1, 2], dtype=tf.int32)
    sess.run(tfspm.encode(
        a, model_file=model_file, nbest_size=nbest_size, name='foo'))

    alpha = tf.constant(1.0)
    sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))

    nbest_size = tf.constant(10, dtype=tf.int32)
    sess.run(tfspm.encode(
        a, model_file=model_file, nbest_size=nbest_size, name='foo'))

    sess.run(tfspm.decode(
        b, sequence_length=tf.constant([2, 2]), model_file=model_file))

    # Invalid inputs: alpha/nbest_size whose shape does not match the
    # batch, and malformed sequence_length arguments.
    with self.assertRaises(ValueError):
      alpha = tf.constant([1.0, 2.0, 3.0])
      sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))

    with self.assertRaises(ValueError):
      nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
      sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))

    with self.assertRaises(ValueError):
      alpha = tf.constant([[1.0], [2.0]])
      sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))

    with self.assertRaises(ValueError):
      nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
      sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))

    with self.assertRaises(ValueError):
      # sequence_length must be a rank-1 tensor, not a Python scalar.
      sess.run(tfspm.decode(b, sequence_length=2, model_file=model_file))

    with self.assertRaises(ValueError):
      # sequence_length whose length does not match the batch size.
      sess.run(tfspm.decode(
          b, sequence_length=tf.constant([2, 2, 2]), model_file=model_file))
def testEncodeAndDecode(self):
  sentencepiece_model_file = self._getSentencePieceModelFile()
  processor = spm.SentencePieceProcessor()
  processor.Load(sentencepiece_model_file)

  with tf.Session():
    for reverse, add_bos, add_eos in list(
        it.product((True, False), repeat=3)):
      (sentences, expected_pieces, expected_ids,
       expected_seq_len) = self._getExpected(
           processor, reverse, add_bos, add_eos)

      # Encode sentences into pieces/ids.
      s = tf.constant(sentences)
      pieces, seq_len1 = tfspm.encode(
          s, model_file=sentencepiece_model_file, reverse=reverse,
          add_bos=add_bos, add_eos=add_eos, out_type=tf.string)
      ids, seq_len2 = tfspm.encode(
          s, model_file=sentencepiece_model_file, reverse=reverse,
          add_bos=add_bos, add_eos=add_eos)

      self.assertEqual(pieces.eval().tolist(), expected_pieces)
      self.assertEqual(ids.eval().tolist(), expected_ids)
      self.assertEqual(seq_len1.eval().tolist(), expected_seq_len)
      self.assertEqual(seq_len2.eval().tolist(), expected_seq_len)

      # Decode pieces/ids back into sentences.
      pieces = tf.constant(expected_pieces)
      ids = tf.constant(expected_ids)
      seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
      decoded_sentences1 = tfspm.decode(
          pieces, seq_len, model_file=sentencepiece_model_file,
          reverse=reverse)
      decoded_sentences2 = tfspm.decode(
          ids, seq_len, model_file=sentencepiece_model_file,
          reverse=reverse)

      self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
      self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
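# The helpers used by the tests above are defined on the enclosing
# tf.test.TestCase subclass (not shown here). Their contracts, inferred
# from usage, are roughly:
#
#   _getSentencePieceModelFile() -> path to a small SentencePiece model.
#   _getExpected(processor, reverse=False, add_bos=False, add_eos=False)
#       -> (sentences, expected_pieces, expected_ids, expected_seq_len),
#          computed with the given spm.SentencePieceProcessor.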
# Standalone inference helper; unlike the test methods above, this is a
# plain module-level function. The `predict` beam-search function it calls
# is assumed to be defined elsewhere.
def predict_batch(sess, src, model, src_model_file, tar_model_file,
                  src_offset, tar_offset, srcf, tarf, vocab_size,
                  single_vocab_size=8192, batch_size=60):
  """Translates a list of source strings in fixed-size batches.

  Args:
    sess: a tf.Session.
    src: list of source strings.
    model: a tf.keras.Model, passed through to `predict`.
    src_model_file: path to the source-side SentencePiece model.
    tar_model_file: path to the target-side SentencePiece model.
    src_offset: offset added to non-padding source ids (e.g. when source and
      target share one vocabulary split into halves).
    tar_offset: offset subtracted from non-padding target ids before decoding.
    srcf: source-side tag forwarded to `predict` as a constant.
    tarf: target-side tag forwarded to `predict` as a constant.
    vocab_size: total vocabulary size used by the beam search.
    single_vocab_size: size of one SentencePiece vocabulary; ids outside
      (0, single_vocab_size) are zeroed out before decoding.
    batch_size: sentences per batch; any final partial batch is dropped.

  Returns:
    A list of decoded target sentences, one per translated input.
  """
  t = len(src)
  ans = []
  # Note: the final t % batch_size sentences are not translated.
  for i in range(t // batch_size):
    print(i)
    start = i * batch_size
    end = start + batch_size
    inp = src[start:end]

    # Encode the batch into ids; [0] keeps the ids, dropping the lengths.
    a = tfspm.encode(inp, model_file=src_model_file,
                     add_bos=True, add_eos=True)[0]
    if src_offset > 0:
      # Shift non-padding ids into the source half of the shared vocabulary.
      a_mask = tf.cast(tf.not_equal(a, 0), tf.int32) * src_offset
      a = a + a_mask

    ids, probs = predict(
        model=model,
        inputs=a,
        inpf=tf.constant(srcf),
        tarf=tf.constant(tarf),
        bos_id=tar_offset + 1,
        eos_id=tar_offset + 2,
        beam_size=5,
        vocab_size=vocab_size,
        alpha=1.0,
    )

    mask = tf.cast(tf.not_equal(ids, 0), tf.int32)
    seq_len = tf.reduce_sum(mask, axis=-1)
    if tar_offset > 0:
      # Undo the target vocabulary offset on non-padding ids.
      ids = ids + mask * -tar_offset
    probs = tf.math.exp(probs)  # Computed but currently unused.

    ids_, seq_len_ = sess.run([ids, seq_len])
    for cids, cseqlen in zip(list(ids_), list(seq_len_)):
      # Zero out ids that fall outside the single-model vocabulary range.
      fids = tf.cast(
          tf.logical_and(tf.greater(cids, 0),
                         tf.less(cids, single_vocab_size)), tf.int32) * cids
      decoded = sess.run(
          tfspm.decode(fids, cseqlen, model_file=tar_model_file))
      ans.append(decoded)
  return ans
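# A hypothetical invocation of predict_batch, kept as a comment because the
# `predict` helper, the model, and the vocabulary files live outside this
# file; every name below (my_model, the *.model paths, offsets, and tags) is
# illustrative, not part of this module:
#
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     outputs = predict_batch(
#         sess, src=source_sentences, model=my_model,
#         src_model_file='src.model', tar_model_file='tar.model',
#         src_offset=0, tar_offset=8192,   # target half of a shared vocab
#         srcf=0, tarf=1,                  # tags consumed by `predict`
#         vocab_size=16384)                # 2 * single_vocab_size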