class query_weight:
    """Compute per-term weights for a query by combining XLNet attention,
    IDF statistics, language-model scores, and an XGBoost ranking model."""

    def __init__(self, ckpt_num=156000, is_training=False):
        #init_log()
        batch_size = 1
        logging.info("Init query weight model ...")
        self.sp = Tokenizer()
        self.lm = language_model()
        self.xgb_model = xgb.Booster(model_file=conf.rank_model)

        tf.logging.set_verbosity(tf.logging.INFO)
        tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32

        # Placeholders for a single query (batch_size == 1).
        self.input_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size, FLAGS.seq_len],
                                        name="input_ids")
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[batch_size, FLAGS.seq_len],
                                          name="segment_ids")
        self.input_mask = tf.placeholder(dtype=tf_float,
                                         shape=[batch_size, FLAGS.seq_len],
                                         name="input_mask")
        self.label_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size],
                                        name="label_ids")

        # XLNet expects [seq_len, batch_size] inputs.
        inp = tf.transpose(self.input_ids, [1, 0])
        seg_id = tf.transpose(self.segment_ids, [1, 0])
        inp_mask = tf.transpose(self.input_mask, [1, 0])

        self.sess = tf.Session()
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)
        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        self.output, self.attn_prob, self.attention_out = (
            xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out)

        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))

        xlnet_model.saver.restore(
            self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num))
        #### load pretrained models
        # scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
        logging.info("Init query weight model finished ...")

    def run(self, req_dict):
        """Entry point for a request dict; returns (token, weight) pairs or None on error."""
        result = None
        try:
            query = req_dict["request"]["p"]["query"]
            result = self.run_step(query)
        except Exception:
            logging.warning("run_error: %s" % traceback.format_exc())
        return result

    def run_step(self, text):
        # Tokenize and build fixed-length model inputs.
        cur_sent = preprocess_text(text.strip(), lower=FLAGS.uncased)
        tokens, ids = self.sp.encode_ids(cur_sent)
        sent_len, diff_len = len(ids) - 1, FLAGS.seq_len - len(ids)
        input_ids = ids + [SEP_ID] * (diff_len - 1) + [CLS_ID]
        # cat_data = np.concatenate([inp, a_data, sep_array, b_data, sep_array, cls_array])
        input_tokens = tokens + ["<sep>"] * (diff_len - 1) + ["<cls>"]
        input_mask = [1] + [0] * sent_len + [1] * diff_len
        segment_ids = [0] * (sent_len + 1) + [2] * diff_len
        # seg_id = ([0] * (reuse_len + a_data.shape[0]) + [0] + [1] * b_data.shape[0] + [1] + [2])

        # Truncate everything to the fixed sequence length.
        input_ids = input_ids[:FLAGS.seq_len]
        input_tokens = input_tokens[:FLAGS.seq_len]
        input_mask = input_mask[:FLAGS.seq_len]
        segment_ids = segment_ids[:FLAGS.seq_len]
        '''
        logging.info("text: %s, seg_text: %s" % (text, " ".join([str(x) for x in tokens])))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        '''
        il = {'text': text,
              'seg_text': " ".join([str(x) for x in tokens]),
              'input_ids': " ".join([str(x) for x in input_ids]),
              'input_mask': " ".join([str(x) for x in input_mask]),
              'segment_ids': " ".join([str(x) for x in segment_ids])}
        logging.info(json.dumps(il, ensure_ascii=False))

        # Run the encoder and collect attention probabilities.
        feed_dict = {
            self.input_ids: [input_ids],
            self.segment_ids: [segment_ids],
            self.input_mask: [input_mask]
        }
        fetch = self.sess.run([self.output, self.attn_prob, self.attention_out], feed_dict)
        out_encode, atten_prob = fetch[0], fetch[1]

        # Per-term weights from attention, IDF, and the language model.
        #weight0 = normalization(self.cal_weight(out_encode, input_tokens))
        weight_attn = normalization(self.weight_attenprob(atten_prob, tokens))
        weight_idf = normalization(self.sp.cal_weight_idf(tokens[1:]))
        weight_lm = normalization(self.lm.cal_weight_lm(tokens[1:]))
        weight_rule = self.merge_weight([(weight_attn, 0.5), (weight_idf, 0.5), (weight_lm, 0.5)])
        self.weight_attn, self.weight_idf, self.weight_lm = weight_attn, weight_idf, weight_lm

        # Re-rank with the XGBoost model and merge with the rule-based weights.
        sen2terms = [e for e in tokens[1:]]
        weightrank = self.rank_weight(sen2terms, weight_attn, weight_idf, weight_lm)
        weight_rank = normalization(weightrank)
        weight = self.merge_weight([(weight_rank, 0.7), (weight_rule, 0.0)])  # 0.6-0.4

        wl = {'weight_rank': ' '.join([str(k) + ':' + str(v) for k, v in weight_rank]),
              'weight_rule': ' '.join([str(k) + ':' + str(v) for k, v in weight_rule]),
              'weight': ' '.join([str(k) + ':' + str(v) for k, v in weight])}
        logging.info(json.dumps(wl, ensure_ascii=False))
        return weight

    def rank_weight(self, sen2terms, weight_attn, weight_idf, weight_lm):
        """Score every term with the XGBoost ranker and normalize the sigmoid-mapped scores."""
        tmp, score_sum = [], 1e-8
        for term in sen2terms:
            feature_vector, _ = get_feature(term, sen2terms, weight_attn, weight_idf, weight_lm)
            feature = np.array(feature_vector)
            feature_csr = sparse.csr_matrix(feature)
            dmat = DMatrix(feature_csr)
            score = self.xgb_model.predict(dmat)[0]
            prob = 1.0 / (1 + math.exp(-1 * score))  # sigmoid
            tmp.append((term, prob))
            score_sum += prob
        res = [(k, round(v / score_sum, 3)) for k, v in tmp]
        return res

    def merge_weight(self, weight_tuple):
        """Linearly combine several (token, weight) lists with their coefficients and renormalize."""
        weight, weight_sum = [], 1e-8
        for j in range(len(weight_tuple[0][0])):
            tmp = 0.0
            for i in range(len(weight_tuple)):
                (word, val), coef = weight_tuple[i][0][j], weight_tuple[i][1]
                tmp += val * coef
            weight.append((weight_tuple[0][0][j][0], tmp))
            weight_sum += tmp
        token_weight = [(k, round(v / weight_sum, 3)) for k, v in weight]
        return token_weight

    def weight_attenprob(self, attention_probs, input_tokens):
        """For each position j, sum attention_probs[i][j][0][0] over all other positions i."""
        weights = []
        (row, col, batch, dim) = attention_probs.shape
        for j in range(col):
            tmp = 0.0
            for i in range(row):
                if i == j:
                    continue
                tmp += attention_probs[i][j][0][0]
            weights.append(tmp)
        token_weight = [(input_tokens[i], weights[i])
                        for i in range(min(len(input_tokens), len(weights)))
                        if input_tokens[i] not in special_words]
        token_weights = token_weight + [(input_tokens[i], 0.0)
                                        for i in range(len(token_weight) + 1, len(input_tokens))]
        return token_weights

    def cal_weight(self, encode_vects, input_tokens):
        # Similarity of each position's encoding to the aggregated sentence vector.
        vects, vect = encode_vects[0], np.sum(encode_vects, axis=1)[0]
        token_weights = [(input_tokens[i], cal_sim(vect, vects[i]))
                         for i in range(len(vects))
                         if input_tokens[i] not in special_words]
        #token_weight = [(input_tokens[i], weight[i-1]) if input_tokens[i] not in special_words else (input_tokens[i], 0.0) for i in range(len(input_tokens))]
        return token_weights
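
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a toy, self-contained
# demonstration of the aggregation used in `weight_attenprob` above -- for each
# token position j, the attention probabilities from every other position i are
# summed while the diagonal i == j is skipped. The function name and the toy
# tensor shape [pos, pos, batch=1, head=1] are assumptions made only for this
# example; the real attention tensor comes from the XLNet graph.
# ---------------------------------------------------------------------------
def _demo_attention_weight_aggregation(seq_len=4):
    import numpy as np  # local import so the sketch stays self-contained

    # Toy attention tensor with shape [from_pos, to_pos, batch, head].
    attention_probs = np.random.rand(seq_len, seq_len, 1, 1)

    weights = []
    for j in range(seq_len):
        # Sum the attention mass associated with position j from all other positions.
        weights.append(sum(attention_probs[i][j][0][0]
                           for i in range(seq_len) if i != j))
    return weights
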
def _create_data(idx, input_paths):
    """Read raw text shards, encode them, and write shuffled TFRecords for pretraining."""
    # Load sentence-piece model
    #sp = spm.SentencePieceProcessor(); sp.Load(FLAGS.sp_path)
    sp = Tokenizer()

    input_shards = []
    total_line_cnt = 0
    for input_path in input_paths:
        input_data, sent_ids = [], []
        sent_id, line_cnt = True, 0
        tf.logging.info("Processing %s", input_path)
        for line in tf.gfile.Open(input_path):
            if line_cnt % 100000 == 0:
                tf.logging.info("Loading line %d", line_cnt)
            line_cnt += 1

            if not line.strip():
                if FLAGS.use_eod:
                    sent_id = not sent_id
                    cur_sent = [EOD_ID]
                else:
                    continue
            else:
                if FLAGS.from_raw_text:
                    cur_sent = preprocess_text(line.strip(), lower=FLAGS.uncased)
                    #cur_sent = encode_ids(sp, cur_sent)
                    _, cur_sent = sp.encode_ids(cur_sent)
                    #a=sp.encode_ids("java开发工程师")
                else:
                    cur_sent = list(map(int, line.strip().split()))

            input_data.extend(cur_sent)
            sent_ids.extend([sent_id] * len(cur_sent))
            sent_id = not sent_id

        tf.logging.info("Finish with line %d", line_cnt)
        if line_cnt == 0:
            continue

        input_data = np.array(input_data, dtype=np.int64)
        sent_ids = np.array(sent_ids, dtype=np.bool)

        total_line_cnt += line_cnt
        input_shards.append((input_data, sent_ids))

    tf.logging.info("[Task %d] Total number of lines: %d", idx, total_line_cnt)

    tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")

    filenames, num_batch = [], 0

    # Randomly shuffle input shards (with a fixed but distinct random seed)
    np.random.seed(100 * FLAGS.task + FLAGS.pass_id)

    perm_indices = np.random.permutation(len(input_shards))
    tf.logging.info("Using perm indices %s for pass %d",
                    perm_indices.tolist(), FLAGS.pass_id)

    input_data_list, sent_ids_list = [], []
    prev_sent_id = None
    for perm_idx in perm_indices:
        input_data, sent_ids = input_shards[perm_idx]

        # make sure that `sent_ids[0] == not prev_sent_id`
        if prev_sent_id is not None and sent_ids[0] == prev_sent_id:
            sent_ids = np.logical_not(sent_ids)

        # append to temporary list
        input_data_list.append(input_data)
        sent_ids_list.append(sent_ids)

        # update `prev_sent_id`
        prev_sent_id = sent_ids[-1]

    input_data = np.concatenate(input_data_list)
    sent_ids = np.concatenate(sent_ids_list)

    file_name, cur_num_batch = create_tfrecords(
        save_dir=tfrecord_dir,
        basename="{}-{}-{}".format(FLAGS.split, idx, FLAGS.pass_id),
        data=[input_data, sent_ids],
        bsz_per_host=FLAGS.bsz_per_host,
        seq_len=FLAGS.seq_len,
        bi_data=FLAGS.bi_data,
        sp=sp,
    )
    filenames.append(file_name)
    num_batch += cur_num_batch

    record_info = {"filenames": filenames, "num_batch": num_batch}

    return record_info
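
# ---------------------------------------------------------------------------
# Illustrative usage sketch (assumption: FLAGS, conf, the tokenizer resources,
# and the XLNet checkpoint are already configured by the rest of this project;
# this block and the sample request below are not part of the original module).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    qw = query_weight(ckpt_num=156000, is_training=False)
    # `run` reads the query text from req_dict["request"]["p"]["query"].
    req = {"request": {"p": {"query": "java开发工程师"}}}
    print(qw.run(req))  # -> list of (token, weight) pairs, or None on error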