Example #1
    def test_text_to_tokenid_with_vocab_file(self):
        '''test converting text to token ids using a vocab file'''
        with self.cached_session(use_gpu=False, force_gpu=False) as sess:
            # test batch
            start = time.time()
            batch_op = py_x_ops.sentence_to_ids(
                ['hello world', '你好 hello unknown  world'],
                maxlen=10,
                use_vocab_file=True,
                vocab_filepath=self.vocab_filepath,
                load_token_ids_from_vocab=False,
                pad_id=-1)
            token_ids, paddings = sess.run(batch_op)
            elapsed = time.time() - start
            logging.info("Time cost: {:.4f}s".format(elapsed))
            logging.info(token_ids)
            logging.info(paddings)
            logging.info("batch_op: {}".format(batch_op))
            self.assertAllEqual(token_ids,
                                [[2, 4, -1, -1, -1, -1, -1, -1, -1, -1],
                                 [3, 2, 1, 4, -1, -1, -1, -1, -1, -1]])
            self.assertAllEqual(paddings, [[0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
                                           [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])

            # test single
            single_op = py_x_ops.sentence_to_ids(
                '你好 hello unknown  world',
                maxlen=10,
                vocab_filepath=self.vocab_filepath,
                use_vocab_file=True,
                load_token_ids_from_vocab=False,
                pad_id=-1)
            token_ids, paddings = sess.run(single_op)
            logging.info("single_op: {}".format(single_op))
            self.assertAllEqual(token_ids,
                                [3, 2, 1, 4, -1, -1, -1, -1, -1, -1])

            # test short single
            short_single_op = py_x_ops.sentence_to_ids(
                '你好 hello unknown  world',
                maxlen=2,
                use_vocab_file=True,
                vocab_filepath=self.vocab_filepath,
                load_token_ids_from_vocab=False,
                pad_id=0)
            token_ids, paddings = sess.run(short_single_op)
            logging.info("short_op: {}".format(short_single_op))
            self.assertAllEqual(token_ids, [3, 2])

            # test short batch
            short_batch_op = py_x_ops.sentence_to_ids(
                ['hello world', '你好 hello unknown  world'],
                maxlen=2,
                use_vocab_file=True,
                vocab_filepath=self.vocab_filepath,
                load_token_ids_from_vocab=False,
                pad_id=0)
            token_ids, paddings = sess.run(short_batch_op)
            logging.info("short_op: {}".format(short_batch_op))
            self.assertAllEqual(token_ids, [[2, 4], [3, 2]])
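The asserted ids above (hello -> 2, 你好 -> 3, world -> 4, out-of-vocabulary "unknown" -> 1) are consistent with a vocab file whose tokens get their ids from line position when load_token_ids_from_vocab=False. Below is a minimal sketch of a fixture that would produce such a file; the tokens at positions 0 and 1 are assumptions for illustration, not taken from the original test.

import tempfile

# Illustrative vocab only: line positions match the ids asserted above
# (hello -> 2, 你好 -> 3, world -> 4, unknown words -> 1). The entries at
# positions 0 and 1 are assumed, not copied from the real test fixture.
ILLUSTRATIVE_VOCAB = ['<s>', '<unk>', 'hello', '你好', 'world']

def write_illustrative_vocab_file():
    """Write the illustrative vocab to a temp file and return its path."""
    vocab_file = tempfile.NamedTemporaryFile(
        mode='w', suffix='.vocab', delete=False, encoding='utf-8')
    vocab_file.write('\n'.join(ILLUSTRATIVE_VOCAB) + '\n')
    vocab_file.close()
    return vocab_file.name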
Example #2
def tokenize_sentence(texts, max_seq_len, vocab_path):
    """Tokenize sentence"""
    token_ids, _ = py_x_ops.sentence_to_ids(texts,
                                            maxlen=max_seq_len,
                                            vocab_filepath=vocab_path,
                                            load_token_ids_from_vocab=True,
                                            pad_id=utils.PAD_IDX)
    return token_ids
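A hypothetical driver for the helper above, assuming py_x_ops and utils are importable from the surrounding project and that the graph-mode TF 1.x API is in use (as in the test of Example #1); 'vocab.txt' and the batch contents are placeholders.

import tensorflow as tf

# Hypothetical usage sketch; 'vocab.txt' is a placeholder vocab path.
ids_op = tokenize_sentence(['hello world', 'goodbye world'],
                           max_seq_len=10,
                           vocab_path='vocab.txt')
with tf.Session() as sess:
    token_ids = sess.run(ids_op)  # expected shape: [batch_size, max_seq_len]
    print(token_ids)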
Example #3
def tokenize_label(label, maxlen, label_vocab_file_path, pad_id):
    """Tokenize labels"""
    label_id, _ = py_x_ops.sentence_to_ids(
        label,
        maxlen=maxlen,
        vocab_filepath=label_vocab_file_path,
        load_token_ids_from_vocab=True,
        pad_id=pad_id,
        check_tokens=False)
    return label_id
Example #4
def tokenize_sentence(texts, max_seq_len, vocab_path):
    """Tokenize sentence"""
    vocabs = read_lines_from_text_file(vocab_path)
    token_ids, _ = py_x_ops.sentence_to_ids(texts,
                                            maxlen=max_seq_len,
                                            use_vocab_file=False,
                                            vocab=vocabs,
                                            load_token_ids_from_vocab=True,
                                            pad_id=utils.PAD_IDX,
                                            check_tokens=False)
    return token_ids
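Examples #4 and #5 pass the vocab in memory instead of as a file path, relying on a read_lines_from_text_file helper that is not shown here. A minimal sketch of what such a helper could look like, assuming a plain UTF-8 text file with one vocab entry per line; the real project helper may differ.

def read_lines_from_text_file(file_path):
    """Read a UTF-8 text file and return its lines with trailing newlines stripped."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]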
Example #5
def tokenize_label(label, maxlen, label_vocab_file_path, pad_id):
    """Tokenize labels"""
    vocabs = read_lines_from_text_file(label_vocab_file_path)
    label_id, _ = py_x_ops.sentence_to_ids(label,
                                           maxlen=maxlen,
                                           use_vocab_file=False,
                                           vocab=vocabs,
                                           load_token_ids_from_vocab=True,
                                           pad_id=pad_id,
                                           check_tokens=False)
    return label_id