Example #1
 def testStrToVocabTokenAppendEOSFalse(self):
   vocab = test_helper.test_src_dir_path('core/ops/testdata/test_vocab.txt')
   with self.session(use_gpu=False) as sess:
     token_ids, target_ids, paddings = sess.run(
         py_x_ops.str_to_vocab_tokens(
             [
                 'a b c d e',
                 '<epsilon> <S> </S> <UNK>',
                 'øut über ♣ 愤青 ←',
             ],
             append_eos=False,
             maxlen=10,
             vocab_filepath=vocab))
     self.assertEqual(
         token_ids.tolist(),
         [[1, 5, 6, 7, 8, 9, 2, 2, 2, 2], [1, 0, 1, 2, 3, 2, 2, 2, 2, 2],
          [1, 10, 11, 12, 13, 3, 2, 2, 2, 2]])
     self.assertEqual(
         target_ids.tolist(),
         [[5, 6, 7, 8, 9, 2, 2, 2, 2, 2], [0, 1, 2, 3, 2, 2, 2, 2, 2, 2],
          [10, 11, 12, 13, 3, 2, 2, 2, 2, 2]])
     self.assertEqual(paddings.tolist(),
                      [[0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
                       [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
                       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.]])
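The expected ids pin down the vocabulary this test reads. The mapping below is reconstructed purely from the assertions (id 4 never appears in these tests, so the verbatim contents of test_vocab.txt may differ):

 # Vocabulary implied by the assertions above (a reconstruction, not the
 # verbatim contents of test_vocab.txt).
 implied_vocab = {
     '<epsilon>': 0, '<S>': 1, '</S>': 2, '<UNK>': 3,
     'a': 5, 'b': 6, 'c': 7, 'd': 8, 'e': 9,
     'øut': 10, 'über': 11, '♣': 12, '愤青': 13,
 }
 # token_ids prefixes every sequence with <S> (1), out-of-vocabulary tokens
 # such as '←' map to <UNK> (3), and padded positions are filled with </S> (2).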
Example #2
 def testStrToVocabTokenTruncates(self):
   vocab = test_helper.test_src_dir_path('core/ops/testdata/test_vocab.txt')
   with self.session(use_gpu=False) as sess:
     token_ids, target_ids, paddings = sess.run(
         py_x_ops.str_to_vocab_tokens(['a b c d e ' * 1000],
                                      append_eos=True,
                                      maxlen=5,
                                      vocab_filepath=vocab))
     self.assertEqual(token_ids.tolist(), [[1, 5, 6, 7, 8]])
     self.assertEqual(target_ids.tolist(), [[5, 6, 7, 8, 9]])
     self.assertEqual(paddings.tolist(), [[0., 0., 0., 0., 0.]])
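The input expands to 5,000 tokens, so maxlen=5 truncates hard: token_ids keeps only <S> plus the first four tokens, target_ids is the same window shifted left by one, and no </S> appears even though append_eos=True, since the EOS no longer fits.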
Example #3
 def testStrToVocabTokenSplitToCharacters(self):
   custom_delimiter = ''
   vocab = test_helper.test_src_dir_path('core/ops/testdata/test_vocab.txt')
   with self.session(use_gpu=False) as sess:
     token_ids, target_ids, paddings = sess.run(
         py_x_ops.str_to_vocab_tokens(['abcde'],
                                      append_eos=True,
                                      maxlen=8,
                                      vocab_filepath=vocab,
                                      delimiter=custom_delimiter))
     self.assertEqual(token_ids.tolist(), [[1, 5, 6, 7, 8, 9, 2, 2]])
     self.assertEqual(target_ids.tolist(), [[5, 6, 7, 8, 9, 2, 2, 2]])
     self.assertEqual(paddings.tolist(), [[0., 0., 0., 0., 0., 0., 1., 1.]])
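An empty delimiter makes the op split the input into individual characters, so 'abcde' tokenizes exactly like 'a b c d e' in Example #1. Here maxlen=8 leaves room for the EOS: target_ids carries </S> (2) at the sixth position, and paddings marks six real steps.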
Example #4
    def _StringsToIdsImpl(self, strs, max_length, append_eos):
        self._CheckParams()
        p = self.params

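        # A plain token vocabulary delegates straight to the str_to_vocab_tokens op.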
        if p.token_vocab_filepath:
            return py_x_ops.str_to_vocab_tokens(
                strs,
                maxlen=max_length,
                append_eos=append_eos,
                vocab_filepath=p.token_vocab_filepath,
                delimiter=p.tokens_delimiter)
        elif p.ngram_vocab_filepath:
            raise NotImplementedError(
                'ngram vocab StringsToIds is not supported.')
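All four snippets come from larger classes, so they are not runnable on their own. A minimal standalone sketch follows; the import path (Lingvo's lingvo.core.ops.py_x_ops) and the vocabulary file name are assumptions for illustration, not something the examples above establish:

 # Sketch only: the import path and 'my_vocab.txt' are assumptions.
 from lingvo.core.ops import py_x_ops

 token_ids, target_ids, paddings = py_x_ops.str_to_vocab_tokens(
     ['a b c d e'],
     append_eos=True,                 # append </S> to target_ids when it fits
     maxlen=8,                        # fixed output width; longer inputs are truncated
     vocab_filepath='my_vocab.txt',   # hypothetical vocabulary path
     delimiter=' ')                   # '' would split into characters instead
 # Under TF1 graph mode (as in the tests above) wrap the call in session.run();
 # under eager execution the three tensors should be returned directly.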