def testSampleEncodeAndDecode(self):
    """Check that sampled encodings decode back to the input sentences.

    For every (nbest_size, alpha) setting the sentences are encoded twice,
    once as string pieces and once as ids, and both encodings are decoded
    again and compared with the original sentences.
    """
    model_path = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(model_path)
    sentences, _pieces, _ids, _seq_len = self._getExpected(processor)

    sampling_settings = [(-1, 0.1), (64, 0.1), (0, 0.0)]
    with tf.Session():
        for nbest, smoothing in sampling_settings:
            # Round trip: sentences -> pieces/ids -> sentences.
            sampling_kwargs = dict(
                nbest_size=tf.constant(nbest),
                alpha=tf.constant(smoothing),
                model_file=model_path,
            )
            inputs = tf.constant(sentences)

            pieces, pieces_len = tfspm.encode(
                inputs, out_type=tf.string, **sampling_kwargs)
            ids, ids_len = tfspm.encode(inputs, **sampling_kwargs)

            from_pieces = tfspm.decode(
                pieces, pieces_len, model_file=model_path)
            from_ids = tfspm.decode(ids, ids_len, model_file=model_path)

            for decoded in (from_pieces, from_ids):
                self.assertEqual(decoded.eval().tolist(), sentences)
def encode_sentencepiece(dataset, a_model_proto, b_model_proto, a_offset,
                         b_offset):
    """Map a dataset of ``(a, b)`` pairs to offset SentencePiece encodings.

    Each side is wrapped into a batch of one with ``tf.expand_dims``, encoded
    through ``tfs.encode`` with BOS/EOS enabled using that side's model proto,
    and element ``[0][0]`` of the result is shifted by that side's offset.

    Args:
        dataset: Object exposing ``map``; its elements are ``(a, b)`` pairs.
        a_model_proto: Model proto passed to ``tfs.encode`` for ``a``.
        b_model_proto: Model proto passed to ``tfs.encode`` for ``b``.
        a_offset: Value added to the encoded ``a`` side.
        b_offset: Value added to the encoded ``b`` side.

    Returns:
        The result of ``dataset.map`` over the encoding function.
    """

    def _encode_one(text, model_proto, offset):
        # Encode a single element as a batch of one, then take its only row.
        encoded = tfs.encode(
            tf.expand_dims(text, 0),
            model_proto=model_proto,
            add_bos=True,
            add_eos=True,
        )
        return encoded[0][0] + offset

    def _encode_pair(a, b):
        return (
            _encode_one(a, a_model_proto, a_offset),
            _encode_one(b, b_model_proto, b_offset),
        )

    return dataset.map(_encode_pair)
def testEncodeAndDecode(self):
    """Compare encode/decode output with expectations for all flag combos.

    Every combination of ``reverse``, ``add_bos`` and ``add_eos`` is checked:
    encoding must reproduce the expected pieces, ids and sequence lengths,
    and decoding the expected pieces/ids must reproduce the sentences.
    """
    model_path = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(model_path)

    with tf.Session():
        for reverse, add_bos, add_eos in it.product((True, False), repeat=3):
            expected = self._getExpected(processor, reverse, add_bos, add_eos)
            (sentences, expected_pieces, expected_ids,
             expected_seq_len) = expected

            # Encode sentences into pieces/ids.
            encode_kwargs = dict(
                model_file=model_path,
                reverse=reverse,
                add_bos=add_bos,
                add_eos=add_eos,
            )
            inputs = tf.constant(sentences)
            pieces, pieces_len = tfspm.encode(
                inputs, out_type=tf.string, **encode_kwargs)
            ids, ids_len = tfspm.encode(inputs, **encode_kwargs)

            self.assertEqual(pieces.eval().tolist(), expected_pieces)
            self.assertEqual(ids.eval().tolist(), expected_ids)
            self.assertEqual(pieces_len.eval().tolist(), expected_seq_len)
            self.assertEqual(ids_len.eval().tolist(), expected_seq_len)

            # Decode the expected pieces/ids back into sentences.
            piece_input = tf.constant(expected_pieces)
            id_input = tf.constant(expected_ids)
            seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
            decode_kwargs = dict(model_file=model_path, reverse=reverse)
            from_pieces = tfspm.decode(piece_input, seq_len, **decode_kwargs)
            from_ids = tfspm.decode(id_input, seq_len, **decode_kwargs)

            self.assertEqual(from_pieces.eval().tolist(), sentences)
            self.assertEqual(from_ids.eval().tolist(), sentences)
def testInvalidModelProto(self):
    """Encoding with the model proto 'invalid proto' raises InternalError."""
    with tf.Session() as sess, self.assertRaises(tf.errors.InternalError):
        # Both graph construction and execution stay inside assertRaises.
        inputs = tf.constant(['Hello world.'])
        encoded = tfspm.encode(
            inputs, model_proto='invalid proto', out_type=tf.string)
        sess.run(encoded)
def testInvalidModelPath(self):
    """Encoding with the model file 'invalid path' raises NotFoundError."""
    with tf.Session() as sess, self.assertRaises(tf.errors.NotFoundError):
        # Both graph construction and execution stay inside assertRaises.
        inputs = tf.constant(['Hello world.'])
        encoded = tfspm.encode(
            inputs, model_file='invalid path', out_type=tf.string)
        sess.run(encoded)
def testLoadModelProto(self):
    """Encode a sentence with the model supplied as serialized bytes.

    The model file is read in binary mode and its contents are handed to
    ``tfspm.encode`` through ``model_proto``; the test passes when running
    the op raises nothing.
    """
    # Makes a serialized model proto. The context manager closes the file
    # handle deterministically instead of leaving it to garbage collection.
    with open(self._getSentencePieceModelFile(), 'rb') as model_fp:
        model_proto = model_fp.read()

    with tf.Session() as sess:
        sentences = ['Hello world.']
        a = tf.constant(sentences)
        sess.run(tfspm.encode(a, model_proto=model_proto, out_type=tf.string))
def testInvalidInput(self):
    """Exercise validation of ``alpha``, ``nbest_size`` and ``sequence_length``.

    With two input sentences, scalar and length-2 ``alpha``/``nbest_size``
    values and a length-2 ``sequence_length`` are run without error.
    Length-3 vectors, rank-2 values and a scalar ``sequence_length`` are
    expected to raise ``ValueError``.
    """
    sentences = ['Hello world.', 'This is a test.']
    ids = [[0, 1], [2, 3]]
    model_file = self._getSentencePieceModelFile()
    with tf.Session() as sess:
        a = tf.constant(sentences)
        b = tf.constant(ids)

        # Accepted shapes: one value per sentence, or a single scalar.
        alpha = tf.constant([1.0, 2.0])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha, name='foo'))

        nbest_size = tf.constant([1, 2], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size, name='foo'))

        alpha = tf.constant(1.0)
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha, name='foo'))

        nbest_size = tf.constant(10, dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size, name='foo'))

        sess.run(tfspm.decode(
            b, sequence_length=tf.constant([2, 2]), model_file=model_file))

        # Rejected: vector length differs from the number of sentences.
        with self.assertRaises(ValueError):
            alpha = tf.constant([1.0, 2.0, 3.0])
            sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))

        with self.assertRaises(ValueError):
            nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
            sess.run(tfspm.encode(
                a, model_file=model_file, nbest_size=nbest_size))

        # Rejected: rank-2 values.
        with self.assertRaises(ValueError):
            alpha = tf.constant([[1.0], [2.0]])
            sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))

        with self.assertRaises(ValueError):
            nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
            sess.run(tfspm.encode(
                a, model_file=model_file, nbest_size=nbest_size))

        # Decode the id tensor `b`, which is accepted above with a length-2
        # sequence_length, so that the malformed sequence_length is the only
        # invalid argument in these calls. Passing the string tensor `a`
        # here would not isolate sequence_length as the cause of the error.
        with self.assertRaises(ValueError):
            sess.run(tfspm.decode(
                b, sequence_length=2, model_file=model_file))

        with self.assertRaises(ValueError):
            sess.run(tfspm.decode(
                b, sequence_length=tf.constant([2, 2, 2]),
                model_file=model_file))
def testInvalidModelProto(self):
    """Expect ``tf.errors.InternalError`` for a malformed ``model_proto``."""
    expected_error = tf.errors.InternalError
    with tf.Session() as sess:
        with self.assertRaises(expected_error):
            text = tf.constant(['Hello world.'])
            pieces_op = tfspm.encode(
                text,
                model_proto='invalid proto',
                out_type=tf.string,
            )
            sess.run(pieces_op)
def testInvalidModelPath(self):
    """Expect ``tf.errors.NotFoundError`` for a non-existent ``model_file``."""
    expected_error = tf.errors.NotFoundError
    with tf.Session() as sess:
        with self.assertRaises(expected_error):
            text = tf.constant(['Hello world.'])
            pieces_op = tfspm.encode(
                text,
                model_file='invalid path',
                out_type=tf.string,
            )
            sess.run(pieces_op)
def testLoadModelProto(self):
    """Run ``tfspm.encode`` with the model passed in as ``model_proto`` bytes.

    The bytes are read from the file returned by
    ``self._getSentencePieceModelFile()``. The test only requires that the
    encode op runs without raising.
    """
    model_path = self._getSentencePieceModelFile()
    # Makes a serialized model proto; `with` guarantees the file is closed
    # even if reading fails.
    with open(model_path, 'rb') as proto_file:
        model_proto = proto_file.read()

    with tf.Session() as sess:
        a = tf.constant(['Hello world.'])
        encode_op = tfspm.encode(
            a, model_proto=model_proto, out_type=tf.string)
        sess.run(encode_op)
def testEncodeAndDecode(self):
    """Check encode and decode results for each reverse/bos/eos setting.

    Expected sentences, pieces, ids and sequence lengths come from
    ``self._getExpected`` for the same flag values. Encoding is compared
    with the expected pieces/ids/lengths, and decoding the expected
    pieces/ids is compared with the sentences.
    """
    model_path = self._getSentencePieceModelFile()
    flag_combinations = it.product((True, False), repeat=3)

    with tf.Session():
        for reverse, add_bos, add_eos in flag_combinations:
            flags_kwargs = dict(
                reverse=reverse, add_bos=add_bos, add_eos=add_eos)
            (sentences, expected_pieces, expected_ids,
             expected_seq_len) = self._getExpected(**flags_kwargs)

            # Encode sentences into pieces/ids.
            inputs = tf.constant(sentences)
            pieces, pieces_len = tfspm.encode(
                inputs, model_file=model_path, out_type=tf.string,
                **flags_kwargs)
            ids, ids_len = tfspm.encode(
                inputs, model_file=model_path, **flags_kwargs)

            encode_checks = [
                (pieces, expected_pieces),
                (ids, expected_ids),
                (pieces_len, expected_seq_len),
                (ids_len, expected_seq_len),
            ]
            for tensor, expected in encode_checks:
                self.assertEqual(tensor.eval().tolist(), expected)

            # Decode the expected pieces/ids back into sentences.
            piece_input = tf.constant(expected_pieces)
            id_input = tf.constant(expected_ids)
            seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
            for encoded in (piece_input, id_input):
                decoded = tfspm.decode(
                    encoded, seq_len, model_file=model_path, reverse=reverse)
                self.assertEqual(decoded.eval().tolist(), sentences)
def testSampleEncodeAndDecode(self):
    """Round-trip sentences through sampled encoding and decoding.

    Runs once per (nbest_size, alpha) pair. The sentences are encoded to
    string pieces and to ids with those settings, and decoding either
    encoding must give back the original sentences.
    """
    model_path = self._getSentencePieceModelFile()
    sentences, _pieces, _ids, _seq_len = self._getExpected()

    with tf.Session():
        for nbest, smoothing in ((-1, 0.1), (64, 0.1), (0, 0.0)):
            # Round trip test.
            nbest_size = tf.constant(nbest)
            alpha = tf.constant(smoothing)
            inputs = tf.constant(sentences)

            encodings = [
                tfspm.encode(
                    inputs, nbest_size=nbest_size, alpha=alpha,
                    model_file=model_path, out_type=tf.string),
                tfspm.encode(
                    inputs, nbest_size=nbest_size, alpha=alpha,
                    model_file=model_path),
            ]
            # Build both decode ops before evaluating either one.
            decoded_ops = [
                tfspm.decode(values, lengths, model_file=model_path)
                for values, lengths in encodings
            ]
            for decoded in decoded_ops:
                self.assertEqual(decoded.eval().tolist(), sentences)
# One-hot language tags, index order en: 0, fr: 1, de: 2, pt: 3.
# The values are deliberately Python lists, exactly as in the tagging step
# they feed. NOTE(review): tf.data is understood to treat lists as tensors
# and tuples as nested structure, so do not turn these into tuples.
_LANGUAGE_ONE_HOT = {
    "en": [1.0, 0.0, 0.0, 0.0],
    "fr": [0.0, 1.0, 0.0, 0.0],
    "de": [0.0, 0.0, 1.0, 0.0],
    "pt": [0.0, 0.0, 0.0, 1.0],
}


def _load_translation_dataset(name, **load_kwargs):
    """Load a supervised TFDS dataset with file shuffling from the flag dir."""
    return tfds.load(
        name,
        as_supervised=True,
        as_dataset_kwargs=dict(shuffle_files=True),
        data_dir=flags.FLAGS.tfds_data_dir,
        **load_kwargs)


def _swap_pair(a, b):
    """Reverse the direction of one ``(a, b)`` example."""
    return b, a


def _encode_filter_and_tag(pairs, model_proto, src_lang, tgt_lang,
                           max_size=None):
    """Encode a dataset of text pairs, optionally filter it, and tag languages.

    Args:
        pairs: Dataset of ``(a, b)`` text pairs.
        model_proto: Serialized model passed to ``tfs.encode``.
        src_lang: Key into ``_LANGUAGE_ONE_HOT`` for the ``a`` side.
        tgt_lang: Key into ``_LANGUAGE_ONE_HOT`` for the ``b`` side.
        max_size: If given, keep only pairs where both encoded sides have
            ``tf.size`` strictly below this value. ``None`` skips filtering.

    Returns:
        Dataset of ``((a, src_tag, b[:-1], tgt_tag), b[1:])`` elements.
    """

    def encode(text):
        # Encode one element as a batch of one and take its only row.
        return tfs.encode(
            tf.expand_dims(text, 0),
            model_proto=model_proto,
            add_bos=True,
            add_eos=True)[0][0]

    pairs = pairs.map(lambda a, b: (encode(a), encode(b)))
    if max_size is not None:
        pairs = pairs.filter(lambda a, b: tf.logical_and(
            tf.size(a) < max_size, tf.size(b) < max_size))

    src_tag = _LANGUAGE_ONE_HOT[src_lang]
    tgt_tag = _LANGUAGE_ONE_HOT[tgt_lang]
    # Inputs carry the target without its last token; the label is the target
    # without its first token.
    return pairs.map(
        lambda a, b: ((a, src_tag, b[:-1], tgt_tag), b[1:]))


def prepare_datasets(batch_size, dataset_size=25000):
    """Build the multilingual training and validation datasets.

    Training data is the concatenation, in this order, of en->fr, fr->en,
    en->de, de->en, fr->pt, pt->fr, en->pt and pt->en. The two directions
    of each para_crawl pair use disjoint slices of ``dataset_size``
    examples; the ted_hrlr directions reuse the same TRAIN split, swapped.
    Validation data is fr->pt followed by pt->en from the VALIDATION splits.

    Every example is encoded with the model read from
    ``flags.FLAGS.encoding_model_file``. Training pairs are kept only when
    both encoded sides are shorter than ``flags.FLAGS.seq_len + 3``;
    validation pairs are not filtered.

    Args:
        batch_size: Batch size used for ``padded_batch`` on both datasets.
        dataset_size: Number of examples taken per para_crawl direction.

    Returns:
        ``(train_data, val_data)``. ``train_data`` is cached, shuffled with
        ``flags.FLAGS.shuffle_buffer_size``, padded-batched, prefetched and
        repeated. ``val_data`` is only padded-batched. Elements have the
        structure ``((src_ids, src_tag, tgt_in, tgt_tag), tgt_out)``.
    """
    en_fr = _load_translation_dataset(
        "para_crawl/enfr_plain_text", split=tfds.Split.TRAIN)
    en_de = _load_translation_dataset(
        "para_crawl/ende_plain_text", split=tfds.Split.TRAIN)
    # Loaded without `split`, so these are indexed by split below.
    fr_pt = _load_translation_dataset("ted_hrlr_translate/fr_to_pt")
    pt_en = _load_translation_dataset("ted_hrlr_translate/pt_to_en")

    # (raw pairs, source language, target language), in concatenation order.
    train_sources = [
        (en_fr.take(dataset_size), "en", "fr"),
        (en_fr.skip(dataset_size).take(dataset_size).map(_swap_pair),
         "fr", "en"),
        (en_de.take(dataset_size), "en", "de"),
        (en_de.skip(dataset_size).take(dataset_size).map(_swap_pair),
         "de", "en"),
        (fr_pt[tfds.Split.TRAIN], "fr", "pt"),
        (fr_pt[tfds.Split.TRAIN].map(_swap_pair), "pt", "fr"),
        (pt_en[tfds.Split.TRAIN].map(_swap_pair), "en", "pt"),
        (pt_en[tfds.Split.TRAIN], "pt", "en"),
    ]
    val_sources = [
        (fr_pt[tfds.Split.VALIDATION], "fr", "pt"),
        (pt_en[tfds.Split.VALIDATION], "pt", "en"),
    ]

    with tf.io.gfile.GFile(flags.FLAGS.encoding_model_file, "rb") as f:
        encoding_model_proto = f.read()

    max_size = flags.FLAGS.seq_len + 3
    train_parts = [
        _encode_filter_and_tag(pairs, encoding_model_proto, src, tgt, max_size)
        for pairs, src, tgt in train_sources
    ]
    val_parts = [
        _encode_filter_and_tag(pairs, encoding_model_proto, src, tgt)
        for pairs, src, tgt in val_sources
    ]

    train_data = train_parts[0]
    for part in train_parts[1:]:
        train_data = train_data.concatenate(part)
    val_data = val_parts[0]
    for part in val_parts[1:]:
        val_data = val_data.concatenate(part)

    padded_shapes = (((-1, ), (-1, ), (-1, ), (-1, )), (-1, ))

    train_data = train_data.cache()
    train_data = train_data.shuffle(flags.FLAGS.shuffle_buffer_size)
    train_data = train_data.padded_batch(
        batch_size, padded_shapes=padded_shapes)
    train_data = train_data.prefetch(tf.data.experimental.AUTOTUNE)
    train_data = train_data.repeat()

    val_data = val_data.padded_batch(batch_size, padded_shapes=padded_shapes)
    return train_data, val_data
def testInvalidInput(self):
    """Check which ``alpha``/``nbest_size``/``sequence_length`` shapes are valid.

    Two sentences are encoded with scalar and length-2 ``alpha`` and
    ``nbest_size`` values, and a 2x2 id matrix is decoded with a length-2
    ``sequence_length``; all of these must run. Length-3 vectors, rank-2
    values and a scalar ``sequence_length`` must raise ``ValueError``.
    """
    sentences = ['Hello world.', 'This is a test.']
    ids = [[0, 1], [2, 3]]
    model_file = self._getSentencePieceModelFile()

    with tf.Session() as sess:
        a = tf.constant(sentences)
        b = tf.constant(ids)

        # --- Valid arguments: these must run without raising. ---
        alpha = tf.constant([1.0, 2.0])
        sess.run(
            tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))

        nbest_size = tf.constant([1, 2], dtype=tf.int32)
        sess.run(
            tfspm.encode(
                a, model_file=model_file, nbest_size=nbest_size, name='foo'))

        alpha = tf.constant(1.0)
        sess.run(
            tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))

        nbest_size = tf.constant(10, dtype=tf.int32)
        sess.run(
            tfspm.encode(
                a, model_file=model_file, nbest_size=nbest_size, name='foo'))

        sess.run(
            tfspm.decode(
                b, sequence_length=tf.constant([2, 2]),
                model_file=model_file))

        # --- Invalid encode arguments. The op is built inside each
        # assertRaises block so an error at construction time is caught. ---
        with self.assertRaises(ValueError):
            alpha = tf.constant([1.0, 2.0, 3.0])
            sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))

        with self.assertRaises(ValueError):
            nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
            sess.run(
                tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))

        with self.assertRaises(ValueError):
            alpha = tf.constant([[1.0], [2.0]])
            sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))

        with self.assertRaises(ValueError):
            nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
            sess.run(
                tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))

        # --- Invalid decode arguments. `b` decodes fine with a length-2
        # sequence_length (see above), so using it here leaves the malformed
        # sequence_length as the only invalid argument. ---
        with self.assertRaises(ValueError):
            sess.run(
                tfspm.decode(b, sequence_length=2, model_file=model_file))

        with self.assertRaises(ValueError):
            sess.run(
                tfspm.decode(
                    b, sequence_length=tf.constant([2, 2, 2]),
                    model_file=model_file))
def predict_batch(sess, src, model, src_model_file, tar_model_file,
                  src_offset, tar_offset, srcf, tarf, vocab_size,
                  single_vocab_size=8192, batch_size=60):
    """Encode, predict and decode ``src`` in batches.

    Every element of ``src`` is processed. The last batch is shorter than
    ``batch_size`` when ``len(src)`` is not a multiple of it, so trailing
    inputs are not silently dropped.

    Args:
        sess: tf.Session used to run the prediction and decode ops.
        src: list of strings to encode with ``src_model_file``.
        model: tf.keras.Model forwarded to ``predict``.
        src_model_file: Model file passed to ``tfs.encode``.
        tar_model_file: Model file passed to ``tfs.decode``.
        src_offset: Added to every non-zero encoded source id when > 0.
        tar_offset: Subtracted from every non-zero predicted id when > 0;
            also determines ``bos_id`` (+1) and ``eos_id`` (+2).
        srcf: Value wrapped in ``tf.constant`` and passed as ``inpf``.
        tarf: Value wrapped in ``tf.constant`` and passed as ``tarf``.
        vocab_size: Forwarded to ``predict``.
        single_vocab_size: Ids outside ``(0, single_vocab_size)`` are zeroed
            before decoding.
        batch_size: Maximum number of source strings per ``predict`` call.

    Returns:
        A list with one ``sess.run(tfs.decode(...))`` result per leading-axis
        entry of the predicted ids, in input order.
    """
    ans = []
    # Loop-invariant constants: build them once rather than once per batch.
    inpf_const = tf.constant(srcf)
    tarf_const = tf.constant(tarf)

    # Striding over start offsets (instead of range(len(src) // batch_size))
    # keeps the trailing partial batch.
    for i, start in enumerate(range(0, len(src), batch_size)):
        print(i)
        inp = src[start:start + batch_size]
        a = tfs.encode(
            inp, model_file=src_model_file, add_bos=True, add_eos=True)[0]
        if src_offset > 0:
            # Shift only non-zero ids so that zeros stay zero.
            a_mask = tf.cast(tf.not_equal(a, 0), tf.int32) * src_offset
            a = a + a_mask

        # The second return value of predict is not needed here.
        ids, _ = predict(
            model=model,
            inputs=a,
            inpf=inpf_const,
            tarf=tarf_const,
            bos_id=tar_offset + 1,
            eos_id=tar_offset + 2,
            beam_size=5,
            vocab_size=vocab_size,
            alpha=1.0,
        )

        mask = tf.cast(tf.not_equal(ids, 0), tf.int32)
        seq_len = tf.reduce_sum(mask, axis=-1)
        if tar_offset > 0:
            # Undo the target offset on non-zero ids only.
            ids = ids + mask * -tar_offset

        ids_, seq_len_ = sess.run([ids, seq_len])
        for cids, cseqlen in zip(ids_, seq_len_):
            # Zero out ids that fall outside (0, single_vocab_size).
            in_vocab = tf.logical_and(
                tf.greater(cids, 0), tf.less(cids, single_vocab_size))
            fids = tf.cast(in_vocab, tf.int32) * cids
            decoded = sess.run(
                tfs.decode(fids, cseqlen, model_file=tar_model_file))
            ans.append(decoded)
    return ans