    def testSampleEncodeAndDecode(self):
        sentencepiece_model_file = self._getSentencePieceModelFile()
        processor = spm.SentencePieceProcessor()
        processor.Load(sentencepiece_model_file)
        sentences, _, _, _ = self._getExpected(processor)

        with tf.Session():
            for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]:
                # Round trip test.
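                # Per SentencePiece's sampling API: nbest_size < 0 samples
                # from the full lattice, nbest_size > 1 from the n-best
                # list, and 0 or 1 disables sampling; alpha smooths the
                # sampling distribution.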
                nbest_size = tf.constant(n)
                alpha = tf.constant(a)
                s = tf.constant(sentences)

                pieces, seq_len1 = tfspm.encode(
                    s,
                    nbest_size=nbest_size,
                    alpha=alpha,
                    model_file=sentencepiece_model_file,
                    out_type=tf.string)
                ids, seq_len2 = tfspm.encode(
                    s,
                    nbest_size=nbest_size,
                    alpha=alpha,
                    model_file=sentencepiece_model_file)
                decoded_sentences1 = tfspm.decode(
                    pieces, seq_len1, model_file=sentencepiece_model_file)
                decoded_sentences2 = tfspm.decode(
                    ids, seq_len2, model_file=sentencepiece_model_file)

                self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
                self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
Example #2
File: one_to_many.py  Project: suyash/mlt
def encode_sentencepiece(dataset, a_model_proto, b_model_proto, a_offset,
                         b_offset):
    return dataset.map(lambda a, b: (
        tfs.encode(tf.expand_dims(a, 0),
                   model_proto=a_model_proto,
                   add_bos=True,
                   add_eos=True)[0][0] + a_offset,
        tfs.encode(tf.expand_dims(b, 0),
                   model_proto=b_model_proto,
                   add_bos=True,
                   add_eos=True)[0][0] + b_offset,
    ))
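A minimal usage sketch for encode_sentencepiece; the model file names, the pairs dataset, and the 8192 offset below are placeholders, not values from the source:

with tf.io.gfile.GFile('a.model', 'rb') as f:
    a_model_proto = f.read()
with tf.io.gfile.GFile('b.model', 'rb') as f:
    b_model_proto = f.read()

# Each side's ids are shifted into its own region of a shared vocabulary.
encoded = encode_sentencepiece(pairs, a_model_proto, b_model_proto,
                               a_offset=0, b_offset=8192)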
    def testEncodeAndDecode(self):
        sentencepiece_model_file = self._getSentencePieceModelFile()
        processor = spm.SentencePieceProcessor()
        processor.Load(sentencepiece_model_file)

        with tf.Session():
            for reverse, add_bos, add_eos in list(
                    it.product((True, False), repeat=3)):
                (sentences, expected_pieces, expected_ids,
                 expected_seq_len) = self._getExpected(processor, reverse,
                                                       add_bos, add_eos)

                # Encode sentences into pieces/ids.
                s = tf.constant(sentences)
                pieces, seq_len1 = tfspm.encode(
                    s,
                    model_file=sentencepiece_model_file,
                    reverse=reverse,
                    add_bos=add_bos,
                    add_eos=add_eos,
                    out_type=tf.string)
                ids, seq_len2 = tfspm.encode(
                    s,
                    model_file=sentencepiece_model_file,
                    reverse=reverse,
                    add_bos=add_bos,
                    add_eos=add_eos)

                self.assertEqual(pieces.eval().tolist(), expected_pieces)
                self.assertEqual(ids.eval().tolist(), expected_ids)
                self.assertEqual(seq_len1.eval().tolist(), expected_seq_len)
                self.assertEqual(seq_len2.eval().tolist(), expected_seq_len)

                # Decode pieces into sentences/ids.
                pieces = tf.constant(expected_pieces)
                ids = tf.constant(expected_ids)
                seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
                decoded_sentences1 = tfspm.decode(
                    pieces,
                    seq_len,
                    model_file=sentencepiece_model_file,
                    reverse=reverse)
                decoded_sentences2 = tfspm.decode(
                    ids,
                    seq_len,
                    model_file=sentencepiece_model_file,
                    reverse=reverse)

                self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
                self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
    def testInvalidModelProto(self):
        with tf.Session() as sess:
            with self.assertRaises(tf.errors.InternalError):
                sentences = ['Hello world.']
                a = tf.constant(sentences)
                sess.run(tfspm.encode(
                    a, model_proto='invalid proto', out_type=tf.string))

    def testInvalidModelPath(self):
        with tf.Session() as sess:
            with self.assertRaises(tf.errors.NotFoundError):
                sentences = ['Hello world.']
                a = tf.constant(sentences)
                sess.run(tfspm.encode(
                    a, model_file='invalid path', out_type=tf.string))

    def testLoadModelProto(self):
        # Makes a serialized model proto.
        with open(self._getSentencePieceModelFile(), 'rb') as f:
            model_proto = f.read()
        with tf.Session() as sess:
            sentences = ['Hello world.']
            a = tf.constant(sentences)
            sess.run(
                tfspm.encode(a, model_proto=model_proto, out_type=tf.string))
    def testInvalidInput(self):
        sentences = ['Hello world.', 'This is a test.']
        ids = [[0, 1], [2, 3]]
        model_file = self._getSentencePieceModelFile()
        with tf.Session() as sess:
            a = tf.constant(sentences)
            b = tf.constant(ids)

            # alpha and nbest_size accept either a scalar or a rank-1
            # tensor with one entry per sentence in the batch.
            alpha = tf.constant([1.0, 2.0])
            sess.run(tfspm.encode(
                a, model_file=model_file, alpha=alpha, name='foo'))

            nbest_size = tf.constant([1, 2], dtype=tf.int32)
            sess.run(tfspm.encode(
                a, model_file=model_file, nbest_size=nbest_size, name='foo'))

            alpha = tf.constant(1.0)
            sess.run(tfspm.encode(
                a, model_file=model_file, alpha=alpha, name='foo'))

            nbest_size = tf.constant(10, dtype=tf.int32)
            sess.run(tfspm.encode(
                a, model_file=model_file, nbest_size=nbest_size, name='foo'))

            sess.run(tfspm.decode(
                b, sequence_length=tf.constant([2, 2]), model_file=model_file))

            # Mismatched lengths or higher-rank alpha/nbest_size, and a
            # sequence_length that is not a rank-1 tensor matching the
            # batch, must raise ValueError.
            with self.assertRaises(ValueError):
                a = tf.constant(sentences)
                alpha = tf.constant([1.0, 2.0, 3.0])
                sess.run(tfspm.encode(
                    a, model_file=model_file, alpha=alpha))
            with self.assertRaises(ValueError):
                a = tf.constant(sentences)
                nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
                sess.run(tfspm.encode(
                    a, model_file=model_file, nbest_size=nbest_size))
            with self.assertRaises(ValueError):
                a = tf.constant(sentences)
                alpha = tf.constant([[1.0], [2.0]])
                sess.run(tfspm.encode(
                    a, model_file=model_file, alpha=alpha))
            with self.assertRaises(ValueError):
                a = tf.constant(sentences)
                nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
                sess.run(tfspm.encode(
                    a, model_file=model_file, nbest_size=nbest_size))
            with self.assertRaises(ValueError):
                b = tf.constant(ids)
                sess.run(tfspm.decode(
                    b, sequence_length=2, model_file=model_file))
            with self.assertRaises(ValueError):
                b = tf.constant(ids)
                sess.run(tfspm.decode(
                    b, sequence_length=tf.constant([2, 2, 2]),
                    model_file=model_file))
Example #13
def prepare_datasets(batch_size, dataset_size=25000):
    en_fr = tfds.load("para_crawl/enfr_plain_text",
                      as_supervised=True,
                      split=tfds.Split.TRAIN,
                      as_dataset_kwargs=dict(shuffle_files=True),
                      data_dir=flags.FLAGS.tfds_data_dir)
    en_de = tfds.load("para_crawl/ende_plain_text",
                      as_supervised=True,
                      split=tfds.Split.TRAIN,
                      as_dataset_kwargs=dict(shuffle_files=True),
                      data_dir=flags.FLAGS.tfds_data_dir)

    fr_pt = tfds.load("ted_hrlr_translate/fr_to_pt",
                      as_supervised=True,
                      as_dataset_kwargs=dict(shuffle_files=True),
                      data_dir=flags.FLAGS.tfds_data_dir)
    pt_en = tfds.load("ted_hrlr_translate/pt_to_en",
                      as_supervised=True,
                      as_dataset_kwargs=dict(shuffle_files=True),
                      data_dir=flags.FLAGS.tfds_data_dir)

    train_en_fr = en_fr.take(dataset_size)
    train_fr_en = en_fr.skip(dataset_size).take(dataset_size).map(
        lambda a, b: (b, a))

    train_en_de = en_de.take(dataset_size)
    train_de_en = en_de.skip(dataset_size).take(dataset_size).map(
        lambda a, b: (b, a))

    train_pt_en = pt_en[tfds.Split.TRAIN]
    train_en_pt = pt_en[tfds.Split.TRAIN].map(lambda a, b: (b, a))

    val_pt_en = pt_en[tfds.Split.VALIDATION]

    train_fr_pt = fr_pt[tfds.Split.TRAIN]
    train_pt_fr = fr_pt[tfds.Split.TRAIN].map(lambda a, b: (b, a))

    val_fr_pt = fr_pt[tfds.Split.VALIDATION]

    with tf.io.gfile.GFile(flags.FLAGS.encoding_model_file, "rb") as f:
        encoding_model_proto = f.read()
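    # The serialized proto is handed to tfs.encode via model_proto, so each
    # op carries the model bytes instead of re-reading the file from disk.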

    def encode_pair(a, b):
        # Encode both sides of a pair with the shared SentencePiece model,
        # adding BOS/EOS markers; tfs.encode expects a batch, so each
        # sentence is wrapped in a size-1 batch and unwrapped afterwards.
        return (tfs.encode(tf.expand_dims(a, 0),
                           model_proto=encoding_model_proto,
                           add_bos=True,
                           add_eos=True)[0][0],
                tfs.encode(tf.expand_dims(b, 0),
                           model_proto=encoding_model_proto,
                           add_bos=True,
                           add_eos=True)[0][0])

    train_en_fr = train_en_fr.map(encode_pair)
    train_fr_en = train_fr_en.map(encode_pair)
    train_en_de = train_en_de.map(encode_pair)
    train_de_en = train_de_en.map(encode_pair)
    train_pt_en = train_pt_en.map(encode_pair)
    train_en_pt = train_en_pt.map(encode_pair)
    val_pt_en = val_pt_en.map(encode_pair)
    train_pt_fr = train_pt_fr.map(encode_pair)
    train_fr_pt = train_fr_pt.map(encode_pair)
    val_fr_pt = val_fr_pt.map(encode_pair)

    def fits_seq_len(a, b):
        # Keep only pairs where both encoded sides fit in seq_len + 3
        # tokens (presumably headroom for the added BOS/EOS markers).
        limit = flags.FLAGS.seq_len + 3
        return tf.logical_and(tf.size(a) < limit, tf.size(b) < limit)

    train_en_fr = train_en_fr.filter(fits_seq_len)
    train_fr_en = train_fr_en.filter(fits_seq_len)
    train_en_de = train_en_de.filter(fits_seq_len)
    train_de_en = train_de_en.filter(fits_seq_len)
    train_fr_pt = train_fr_pt.filter(fits_seq_len)
    train_pt_fr = train_pt_fr.filter(fits_seq_len)
    train_en_pt = train_en_pt.filter(fits_seq_len)
    train_pt_en = train_pt_en.filter(fits_seq_len)

    # en: 0, fr: 1, de: 2, pt: 3
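    # Each pair becomes ((encoder_ids, source_lang_one_hot,
    # decoder_input_ids, target_lang_one_hot), target_ids): b[:-1] is the
    # teacher-forcing decoder input and b[1:] the shifted prediction
    # target.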

    train_en_fr = train_en_fr.map(lambda a, b: (
        (a, [1.0, 0.0, 0.0, 0.0], b[:-1], [0.0, 1.0, 0.0, 0.0]), b[1:]))
    train_fr_en = train_fr_en.map(lambda a, b: (
        (a, [0.0, 1.0, 0.0, 0.0], b[:-1], [1.0, 0.0, 0.0, 0.0]), b[1:]))

    train_en_de = train_en_de.map(lambda a, b: (
        (a, [1.0, 0.0, 0.0, 0.0], b[:-1], [0.0, 0.0, 1.0, 0.0]), b[1:]))
    train_de_en = train_de_en.map(lambda a, b: (
        (a, [0.0, 0.0, 1.0, 0.0], b[:-1], [1.0, 0.0, 0.0, 0.0]), b[1:]))

    train_fr_pt = train_fr_pt.map(lambda a, b: (
        (a, [0.0, 1.0, 0.0, 0.0], b[:-1], [0.0, 0.0, 0.0, 1.0]), b[1:]))
    train_pt_fr = train_pt_fr.map(lambda a, b: (
        (a, [0.0, 0.0, 0.0, 1.0], b[:-1], [0.0, 1.0, 0.0, 0.0]), b[1:]))

    train_en_pt = train_en_pt.map(lambda a, b: (
        (a, [1.0, 0.0, 0.0, 0.0], b[:-1], [0.0, 0.0, 0.0, 1.0]), b[1:]))
    train_pt_en = train_pt_en.map(lambda a, b: (
        (a, [0.0, 0.0, 0.0, 1.0], b[:-1], [1.0, 0.0, 0.0, 0.0]), b[1:]))

    val_fr_pt = val_fr_pt.map(lambda a, b: (
        (a, [0.0, 1.0, 0.0, 0.0], b[:-1], [0.0, 0.0, 0.0, 1.0]), b[1:]))

    val_pt_en = val_pt_en.map(lambda a, b: (
        (a, [0.0, 0.0, 0.0, 1.0], b[:-1], [1.0, 0.0, 0.0, 0.0]), b[1:]))

    train_data = train_en_fr.concatenate(train_fr_en).concatenate(
        train_en_de).concatenate(train_de_en).concatenate(
            train_fr_pt).concatenate(train_pt_fr).concatenate(
                train_en_pt).concatenate(train_pt_en)

    val_data = val_fr_pt.concatenate(val_pt_en)

    train_data = train_data.cache()
    train_data = train_data.shuffle(flags.FLAGS.shuffle_buffer_size)
    train_data = train_data.padded_batch(batch_size,
                                         padded_shapes=(((-1, ), (-1, ),
                                                         (-1, ), (-1, )),
                                                        (-1, )))
    train_data = train_data.prefetch(tf.data.experimental.AUTOTUNE)
    train_data = train_data.repeat()

    val_data = val_data.padded_batch(batch_size,
                                     padded_shapes=(((-1, ), (-1, ), (-1, ),
                                                     (-1, )), (-1, )))

    return train_data, val_data
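A hedged usage sketch; the model and the step/epoch counts are placeholders (prepare_datasets reads its data paths and sequence length from flags.FLAGS):

train_data, val_data = prepare_datasets(batch_size=64)
# train_data repeats indefinitely, so steps_per_epoch must be set.
model.fit(train_data,
          validation_data=val_data,
          steps_per_epoch=1000,
          epochs=10)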
Example #15
File: evaluation.py  Project: suyash/mlt
def predict_batch(sess,
                  src,
                  model,
                  src_model_file,
                  tar_model_file,
                  src_offset,
                  tar_offset,
                  srcf,
                  tarf,
                  vocab_size,
                  single_vocab_size=8192,
                  batch_size=60):
    """
    sess: tf.Session
    src: list of strings
    model: tf.keras.Model
    """
    t = len(src)

    ans = []

    for i in range(t // batch_size):
        print(i)

        start = i * batch_size
        end = start + batch_size
        inp = src[start:end]

        a = tfs.encode(inp,
                       model_file=src_model_file,
                       add_bos=True,
                       add_eos=True)[0]

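        # Shift every non-padding id by src_offset so the source language
        # occupies its own slice of the shared multilingual vocabulary;
        # padding (id 0) is left untouched by the mask.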
        if src_offset > 0:
            a_mask = tf.cast(tf.not_equal(a, 0), tf.int32) * src_offset
            a = a + a_mask

        ids, probs = predict(
            model=model,
            inputs=a,
            inpf=tf.constant(srcf),
            tarf=tf.constant(tarf),
            bos_id=tar_offset + 1,
            eos_id=tar_offset + 2,
            beam_size=5,
            vocab_size=vocab_size,
            alpha=1.0,
        )

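        # Padding positions are 0, so each sequence's length is simply the
        # count of its non-zero ids.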
        mask = tf.cast(tf.not_equal(ids, 0), tf.int32)
        seq_len = tf.reduce_sum(mask, axis=-1)

        if tar_offset > 0:
            ids = ids + mask * -tar_offset

        probs = tf.math.exp(probs)

        ids_, seq_len_ = sess.run([ids, seq_len])

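        # Zero out ids that fall outside the target language's own
        # single-model vocabulary range before decoding with its
        # SentencePiece model.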
        for cids, cseqlen in zip(list(ids_), list(seq_len_)):
            fids = tf.cast(
                tf.logical_and(tf.greater(cids, 0),
                               tf.less(cids, single_vocab_size)),
                tf.int32) * cids
            decoded = sess.run(
                tfs.decode(fids, cseqlen, model_file=tar_model_file))
            ans.append(decoded)

    return ans
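A hedged usage sketch; every argument below (model, vocabulary layout, language feature vectors, model files) is a placeholder consistent with the signature, not a value from the source:

translations = predict_batch(
    sess, src=fr_sentences, model=transformer,
    src_model_file='fr.model', tar_model_file='pt.model',
    src_offset=1 * 8192, tar_offset=3 * 8192,
    srcf=[0.0, 1.0, 0.0, 0.0], tarf=[0.0, 0.0, 0.0, 1.0],
    vocab_size=4 * 8192)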