Example #1
 def __init__(self):
     self.seg = Tokenizer()
     self.lm = kenlm.Model(conf.lm)
     self.punctuation_list = ".。,,,、??::;;{}[]【】“‘’”《》/!!%……()<>@#$~^¥%&*\"\'=+-_——「」"
     self.stopwords = [
         e.strip()
         for e in open(conf.stop_words, encoding="utf8").readlines()
     ]
Example #2
def gen_true_data(source_path, out_path):
    t = Tokenizer()
    res = []
    total_num = len(open(source_path, encoding="utf8").readlines())
    with open(source_path, encoding="utf8") as fin:
        for line in tqdm(fin, total=total_num):
            query = line.strip().split("\t")[0]
            senten2term, _ = t.tokenize(query)
            if len(senten2term) < 2: continue
            res.append("\t".join(senten2term) + "\n")
    with open(out_path, "w", encoding="utf8") as fout:
        fout.write("".join(res))
Example #3
def get_corpus(file_path="position_name_desc_re"):
    title_entitys = {}
    token = Tokenizer()
    for file_name in os.listdir(file_path):  # iterate over the files in the folder
        text = [
            line.strip().lower().replace("\\n", "").split('\t')
            for line in open(file_path + "/" +
                             file_name, encoding="utf8").readlines()
        ]
        for line in tqdm(text, total=len(text)):
            if len(line) <= 1: continue
            important_tokens = token.select_important_tokens("".join(line[1:]))
            if line[0] not in title_entitys: title_entitys[line[0]] = []
            title_entitys[line[0]].extend(important_tokens)
    return title_entitys
Example #4
def label_data(path, out_path):
    print("generage query weighting label data")
    t = Tokenizer() ; res = []
    total_num = len(open(path, encoding="utf8").readlines())
    for i, line in enumerate(tqdm(open(path, encoding="utf8"), total=total_num)):
        line_info = json.loads(line)
        cv_info, jd_info = json.loads(line_info['cv']), json.loads(line_info['jd'])
        senten2term, word_seg = t.tokenize(jd_info['name'])
        weight_cv = cv_weight(cv_info, senten2term, t)
        weight_jd = jd_weight(jd_info, senten2term, t)
        tmp = "\t".join([(weight_cv[i][0] + ":" + str(round(0.6 * weight_jd[i][1] + 0.4 * weight_cv[i][1], 3))) for i in range(len(weight_cv))]) + "\n"
        res.append(tmp)
    print("writing label data %s" % (out_path))
    with open(out_path, "w", encoding="utf8") as fout:
        fout.write("".join(res))
Example #5
class LanguageModelScore:
    """Value = {perplexity (entire query) / perplexity (entire query without current term)}. This feature reflects the query quality with/without current term"""
    def __init__(self):
        self.seg = Tokenizer()
        self.lm = kenlm.Model(conf.lm)
        self.punctuation_list = ".。,,,、??::;;{}[]【】“‘’”《》/!!%……()<>@#$~^¥%&*\"\'=+-_——「」"
        self.stopwords = [
            e.strip()
            for e in open(conf.stop_words, encoding="utf8").readlines()
        ]

    def weight_lm(self, sentence):
        senten2term, word_seg = self.seg.tokenize(sentence)
        total_score = self.lm.perplexity(' '.join(senten2term))
        weight, weight_sum = [], 1e-8  # small epsilon avoids division by zero when every term is filtered out
        for i in range(len(senten2term)):
            tmp = [senten2term[j] for j in range(len(senten2term)) if i != j]
            score = self.lm.perplexity((' '.join(tmp)))
            val = total_score / score
            if senten2term[i] in self.punctuation_list or senten2term[i] in self.stopwords:
                val = 0.0
            weight.append((senten2term[i], val))
            weight_sum += val
        token_weight = [(k, round(v / weight_sum, 3)) for k, v in weight]
        return token_weight
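
A minimal worked sketch of the leave-one-out weighting used in weight_lm above. The fake_perplexity helper and its scores are hypothetical stand-ins for the real kenlm model loaded from conf.lm; they only illustrate the arithmetic:

def fake_perplexity(tokens):
    # Hypothetical stand-in for kenlm.Model.perplexity; real scores come from the model at conf.lm.
    scores = {"java developer": 120.0, "developer": 300.0, "java": 90.0}
    return scores.get(" ".join(tokens), 200.0)

def leave_one_out_weights(senten2term):
    total_score = fake_perplexity(senten2term)
    weight, weight_sum = [], 1e-8
    for i, term in enumerate(senten2term):
        rest = [t for j, t in enumerate(senten2term) if j != i]
        val = total_score / fake_perplexity(rest)  # ratio of full-query perplexity to leave-one-out perplexity
        weight.append((term, val))
        weight_sum += val
    return [(k, round(v / weight_sum, 3)) for k, v in weight]

print(leave_one_out_weights(["java", "developer"]))  # [('java', 0.231), ('developer', 0.769)]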
Example #6
 def __init__(self, model_type='rnn', ckpt_num=0):
     tf.logging.set_verbosity(tf.logging.INFO)
     self.tokenizer = Tokenizer()
     self.encoder = Encoder(model_type)
     self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(
         ckpt_num)
     # model inputs
     self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN],
                                name='a')  # [batch_size, SEQ_LEN]
     self.b_in = tf.placeholder(
         tf.int32, [None, None, SEQ_LEN],
         name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
     self.is_training = tf.placeholder_with_default(False, shape=())
     # create the session
     self.session = tf.Session()
     self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
         self.a_in, self.b_in, self.is_training)  # semantic encoding
     tf.train.Saver().restore(self.session, self.estimator_save_name)
Example #7
    def __init__(self, ckpt_num=156000, is_training=False):
        #init_log()
        self.logs = {}
        batch_size = 1
        logging.info("Init query weight model ...")
        self.sp = Tokenizer()
        self.lm = language_model()
        self.xgb_model = xgb.Booster(model_file=conf.rank_model)
        #self.xgb_dict = parse_xgb_dict(conf.rank_model + '.txt')
        tf.logging.set_verbosity(tf.logging.INFO)
        tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32
        self.input_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size, FLAGS.seq_len],
                                        name="input_ids")
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[batch_size, FLAGS.seq_len],
                                          name="segment_ids")
        self.input_mask = tf.placeholder(dtype=tf_float,
                                         shape=[batch_size, FLAGS.seq_len],
                                         name="input_mask")
        self.label_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size],
                                        name="label_ids")
        inp = tf.transpose(self.input_ids, [1, 0])
        seg_id = tf.transpose(self.segment_ids, [1, 0])
        inp_mask = tf.transpose(self.input_mask, [1, 0])
        self.sess = tf.Session()
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)

        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        self.output, self.attn_prob, self.attention_out = xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out

        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))
        xlnet_model.saver.restore(
            self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num))
        #### load pretrained models
        # scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
        logging.info("Init query weight model finished ...")
Example #8
class queryDocSim:
    def __init__(self, model_type='rnn', ckpt_num=14000):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.tokenizer = Tokenizer()
        self.encoder = Encoder(model_type)
        self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(
            ckpt_num)
        # model inputs
        self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN],
                                   name='a')  # [batch_size, SEQ_LEN]
        self.b_in = tf.placeholder(
            tf.int32, [None, None, SEQ_LEN],
            name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
        self.is_training = tf.placeholder_with_default(False, shape=())
        # create the session
        self.session = tf.Session()
        self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
            self.a_in, self.b_in, self.is_training)  # semantic encoding
        tf.train.Saver().restore(self.session,
                                 self.estimator_save_name)  # load the model

    def run_step(self, entity, entity_list):
        x_batch = np.array([seq2ids(entity)])
        y_batch = np.array([[seq2ids(e) for e in entity_list]])
        fetch = self.session.run(
            {'emb_a': self.encoder.emb_a, 'emb_b': self.encoder.emb_b, 'sim_ab': self.encoder.sim_ab}, \
            feed_dict={self.a_in: x_batch, self.b_in: y_batch, self.is_training: False})
        res = {
            entity + "-" + e: fetch['sim_ab'][0][i]
            for i, e in enumerate(entity_list)
        }
        sorted_res = sorted(res.items(), key=lambda d: d[1], reverse=True)
        return sorted_res

    def sim(self, query, doc, topk=5):
        sim_sum = 0.0
        tmp = self.tokenizer.select_important_tokens(doc)
        sim_res = self.run_step(query, tmp)
        prob_res = [(k, sigmoid(v)) for k, v in sim_res]
        for k, v in prob_res[:topk]:
            sim_sum += v
        sim_avg = round(sim_sum / topk, 3)
        return sim_avg, prob_res

    def cal_sim(self, req_dict):
        similarity, prob_res = 0.0, []
        try:
            query = req_dict['request']['p']['query']
            doc = req_dict['request']['p']['doc']
            similarity, prob_res = self.sim(query, doc)
        except Exception as e:
            logging.warning("run_error: %s" % traceback.format_exc())
        return similarity, prob_res
Example #9
def gen_entity_dict():
    token = Tokenizer()
    '''
    title_freq = Counter([line.split("\t")[0] for line in open("data/jdtitledesc", encoding="utf8").readlines()])
    top_title_freq = title_freq.most_common()
    with open("data/total_title", "w", encoding="utf8") as fin:
        for t, f in top_title_freq:
            fin.write(t + "\t" + str(f) + "\n")
    '''
    match_obj = re.compile("(.+)\t([0-9]+)", re.M | re.I)
    titles, title_words = [], []
    stop_word_re = "[" + "|".join(STOP_WORDS) + "]{1,}"
    custom_word_re = "[急聘|诚聘|双休|代表|高薪|五险]{1,}"
    punction_re = "[" + "\|".join([e for e in PUNCTUATION_LIST]) + "]{1,}"
    salary_re = "50k"
    t = re.sub(custom_word_re + stop_word_re, "", "50k,{急聘客服专员(双休  五险一金)")
    text = [
        line.strip().lower()
        for line in open("data/total_title", encoding="utf8").readlines()
    ]
    for line in tqdm(text, total=len(text)):
        match_res = match_obj.match(line)
        if not match_res: continue
        title, freq = match_res.group(1), int(match_res.group(2))
        if freq <= 2 or len(title) >= 10: continue
        #title = "50k,急聘客服专员(双休  五险一金)"
        title = re.split("[(|( )/]", title)[0]
        title = re.sub(custom_word_re + stop_word_re, "", title)
        titles.append(title)
        title_words.extend(token.cut(title)[0])
    title_freq = Counter(title_words).most_common()
    with open("data/title_entitys", "w", encoding="utf8") as fin:
        for t, f in title_freq:
            fin.write(t + "\n")
    with open("data/valid_titles", "w", encoding="utf8") as fin:
        for t, f in Counter(titles).most_common():
            fin.write(t + "\n")
    exit()
Example #10
class entitySimilar:
    def __init__(self, model_type='rnn', ckpt_num=0):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.tokenizer = Tokenizer()
        self.encoder = Encoder(model_type)
        self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(
            ckpt_num)
        # model inputs
        self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN],
                                   name='a')  # [batch_size, SEQ_LEN]
        self.b_in = tf.placeholder(
            tf.int32, [None, None, SEQ_LEN],
            name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
        self.is_training = tf.placeholder_with_default(False, shape=())
        # create the session
        self.session = tf.Session()
        self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
            self.a_in, self.b_in, self.is_training)  # semantic encoding
        tf.train.Saver().restore(self.session, self.estimator_save_name)

    def run_step(self, entity, entity_list):
        x_batch = np.array([seq2ids(entity)])
        y_batch = np.array([[seq2ids(e) for e in entity_list]])
        fetch = self.session.run(
            {'emb_a': self.encoder.emb_a, 'emb_b': self.encoder.emb_b, 'sim_ab': self.encoder.sim_ab}, \
            feed_dict={self.a_in: x_batch, self.b_in: y_batch, self.is_training: False})
        res = {
            entity + "-" + e: fetch['sim_ab'][0][i]
            for i, e in enumerate(entity_list)
        }
        sorted_res = sorted(res.items(), key=lambda d: d[1], reverse=True)
        return sorted_res

    def analyze(self, word, text):
        tmp = self.tokenizer.select_important_tokens(text)
        sim_res = self.run_step(word, tmp)
        prob_res = [(k, sigmoid(v)) for k, v in sim_res]
        return prob_res
Example #11
class query_weight:
    def __init__(self, ckpt_num=156000, is_training=False):
        #init_log()
        batch_size = 1
        logging.info("Init query weight model ...")
        self.sp = Tokenizer()
        self.lm = language_model()
        self.xgb_model = xgb.Booster(model_file=conf.rank_model)
        tf.logging.set_verbosity(tf.logging.INFO)
        tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32
        self.input_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size, FLAGS.seq_len],
                                        name="input_ids")
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[batch_size, FLAGS.seq_len],
                                          name="segment_ids")
        self.input_mask = tf.placeholder(dtype=tf_float,
                                         shape=[batch_size, FLAGS.seq_len],
                                         name="input_mask")
        self.label_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size],
                                        name="label_ids")
        inp = tf.transpose(self.input_ids, [1, 0])
        seg_id = tf.transpose(self.segment_ids, [1, 0])
        inp_mask = tf.transpose(self.input_mask, [1, 0])
        self.sess = tf.Session()
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)

        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        self.output, self.attn_prob, self.attention_out = xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out

        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))
        xlnet_model.saver.restore(
            self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num))
        #### load pretrained models
        # scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
        logging.info("Init query weight model finished ...")

    def run(self, req_dict):
        result = None
        try:
            query = req_dict["request"]["p"]["query"]
            result = self.run_step(query)
        except Exception as e:
            logging.warning("run_error: %s" % traceback.format_exc())
        return result

    def run_step(self, text):
        cur_sent = preprocess_text(text.strip(), lower=FLAGS.uncased)
        tokens, ids = self.sp.encode_ids(cur_sent)
        sent_len, diff_len = len(ids) - 1, FLAGS.seq_len - len(ids)

        #  cat_data = np.concatenate([inp, a_data, sep_array, b_data, sep_array, cls_array])
        input_ids = ids + [SEP_ID] * (diff_len - 1) + [CLS_ID]
        input_tokens = tokens + ["<sep>"] * (diff_len - 1) + ["<cls>"]
        input_mask = [1] + [0] * sent_len + [1] * diff_len
        # seg_id = ([0] * (reuse_len + a_data.shape[0]) + [0] + [1] * b_data.shape[0] + [1] + [2])
        segment_ids = [0] * (sent_len + 1) + [2] * diff_len
        input_ids = input_ids[:FLAGS.seq_len]
        input_tokens = input_tokens[:FLAGS.seq_len]
        input_mask = input_mask[:FLAGS.seq_len]
        segment_ids = segment_ids[:FLAGS.seq_len]
        '''
       logging.info("text: %s, seg_text: %s" % (text, " ".join([str(x) for x in tokens])))
       logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
       logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
       logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
       '''
        il = {'text': text, 'seg_text': " ".join([str(x) for x in tokens]),
              'input_ids': " ".join([str(x) for x in input_ids]),
              'input_mask': " ".join([str(x) for x in input_mask]),
              'segment_ids': " ".join([str(x) for x in segment_ids])}
        logging.info(json.dumps(il, ensure_ascii=False))

        feed_dict = {
            self.input_ids: [input_ids],
            self.segment_ids: [segment_ids],
            self.input_mask: [input_mask]
        }
        fetch = self.sess.run(
            [self.output, self.attn_prob, self.attention_out], feed_dict)
        out_encode, atten_prob = fetch[0], fetch[1]
        #weight0 = normalization(self.cal_weight(out_encode, input_tokens))
        weight_attn = normalization(self.weight_attenprob(atten_prob, tokens))
        weight_idf = normalization(self.sp.cal_weight_idf(tokens[1:]))
        weight_lm = normalization(self.lm.cal_weight_lm(tokens[1:]))
        weight_rule = self.merge_weight([(weight_attn, 0.5), (weight_idf, 0.5),
                                         (weight_lm, 0.5)])
        self.weight_attn, self.weight_idf, self.weight_lm = weight_attn, weight_idf, weight_lm
        sen2terms = [e for e in tokens[1:]]
        weightrank = self.rank_weight(sen2terms, weight_attn, weight_idf,
                                      weight_lm)
        weight_rank = normalization(weightrank)
        weight = self.merge_weight([(weight_rank, 0.7),
                                    (weight_rule, 0.0)])  # 0.6-0.4
        wl = {'weight_rank': ' '.join([str(k) + ':' + str(v) for k, v in weight_rank]),
              'weight_rule': ' '.join([str(k) + ':' + str(v) for k, v in weight_rule]),
              'weight': ' '.join([str(k) + ':' + str(v) for k, v in weight])}
        logging.info(json.dumps(wl, ensure_ascii=False))
        return weight

    def rank_weight(self, sen2terms, weight_attn, weight_idf, weight_lm):
        tmp, score_sum = [], 1e-8
        for term in sen2terms:
            feature_vector, _ = get_feature(term, sen2terms, weight_attn,
                                            weight_idf, weight_lm)
            feature = np.array(feature_vector)
            feature_csr = sparse.csr_matrix(feature)
            input = DMatrix(feature_csr)
            score = self.xgb_model.predict(input)[0]
            prob = 1.0 / (1 + math.exp(-1 * score))
            tmp.append((term, prob))
            score_sum += prob
        res = [(k, round(v / score_sum, 3)) for k, v in tmp]
        return res

    def merge_weight(self, weight_tuple):
        weight, weight_sum = [], 1e-8
        for j in range(len(weight_tuple[0][0])):
            tmp = 0.0
            for i in range(len(weight_tuple)):
                (word, val), coef = weight_tuple[i][0][j], weight_tuple[i][1]
                tmp += val * coef
            weight.append((weight_tuple[0][0][j][0], tmp))
            weight_sum += tmp
        token_weight = [(k, round(v / weight_sum, 3)) for k, v in weight]
        return token_weight

    def weight_attenprob(self, attention_probs, input_tokens):
        weights = []
        (row, col, batch, dim) = attention_probs.shape
        for j in range(col):
            tmp = 0.0
            for i in range(row):
                if i == j: continue
                tmp += attention_probs[i][j][0][0]
            weights.append(tmp)
        token_weight = [(input_tokens[i], weights[i])
                        for i in range(min(len(input_tokens), len(weights)))
                        if input_tokens[i] not in special_words]
        token_weights = token_weight + [
            (input_tokens[i], 0.0)
            for i in range(len(token_weight) + 1, len(input_tokens))
        ]
        return token_weights

    def cal_weight(self, encode_vects, input_tokens):
        vects, vect = encode_vects[0], np.sum(encode_vects, axis=1)[0]
        token_weights = [(input_tokens[i], cal_sim(vect, vects[i]))
                         for i in range(len(vects))
                         if input_tokens[i] not in special_words]
        #token_weight = [(input_tokens[i], weight[i-1]) if input_tokens[i] not in special_words else (input_tokens[i], 0.0) for i in range(len(input_tokens))]
        return token_weights
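
For reference, a small standalone sketch of the padding layout built in run_step above. The SEQ_LEN, SEP_ID and CLS_ID values here are hypothetical stand-ins (XLNet's vocabulary conventionally uses 4 for <sep> and 3 for <cls>, but the actual constants come from this repo's imports):

SEQ_LEN, SEP_ID, CLS_ID = 8, 4, 3  # hypothetical values, for illustration only

def build_inputs(ids, seq_len=SEQ_LEN):
    # Mirrors run_step: content ids, then <sep> padding, then a final <cls>.
    sent_len, diff_len = len(ids) - 1, seq_len - len(ids)
    input_ids = ids + [SEP_ID] * (diff_len - 1) + [CLS_ID]
    input_mask = [1] + [0] * sent_len + [1] * diff_len   # 1 = ignore (leading special token and padding)
    segment_ids = [0] * (sent_len + 1) + [2] * diff_len  # 2 marks the padding/<cls> segment
    return input_ids[:seq_len], input_mask[:seq_len], segment_ids[:seq_len]

print(build_inputs([17, 52, 903]))
# ([17, 52, 903, 4, 4, 4, 4, 3], [1, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 2, 2, 2, 2, 2])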
Example #12
def _create_data(idx, input_paths):
    # Load sentence-piece model
    #sp = spm.SentencePieceProcessor(); sp.Load(FLAGS.sp_path)
    sp = Tokenizer()

    input_shards = []
    total_line_cnt = 0
    for input_path in input_paths:
        input_data, sent_ids = [], []
        sent_id, line_cnt = True, 0
        tf.logging.info("Processing %s", input_path)
        for line in tf.gfile.Open(input_path):
            if line_cnt % 100000 == 0:
                tf.logging.info("Loading line %d", line_cnt)
            line_cnt += 1

            if not line.strip():
                if FLAGS.use_eod:
                    sent_id = not sent_id
                    cur_sent = [EOD_ID]
                else:
                    continue
            else:
                if FLAGS.from_raw_text:
                    cur_sent = preprocess_text(line.strip(),
                                               lower=FLAGS.uncased)
                    #cur_sent = encode_ids(sp, cur_sent)
                    _, cur_sent = sp.encode_ids(cur_sent)
                    #a=sp.encode_ids("java开发工程师")
                else:
                    cur_sent = list(map(int, line.strip().split()))

            input_data.extend(cur_sent)
            sent_ids.extend([sent_id] * len(cur_sent))
            sent_id = not sent_id

        tf.logging.info("Finish with line %d", line_cnt)
        if line_cnt == 0:
            continue

        input_data = np.array(input_data, dtype=np.int64)
        sent_ids = np.array(sent_ids, dtype=np.bool)

        total_line_cnt += line_cnt
        input_shards.append((input_data, sent_ids))

    tf.logging.info("[Task %d] Total number line: %d", idx, total_line_cnt)

    tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")

    filenames, num_batch = [], 0

    # Randomly shuffle input shards (with a fixed but distinct random seed)
    np.random.seed(100 * FLAGS.task + FLAGS.pass_id)

    perm_indices = np.random.permutation(len(input_shards))
    tf.logging.info("Using perm indices %s for pass %d", perm_indices.tolist(),
                    FLAGS.pass_id)

    input_data_list, sent_ids_list = [], []
    prev_sent_id = None
    for perm_idx in perm_indices:
        input_data, sent_ids = input_shards[perm_idx]
        # make sure that sent_ids[0] == not prev_sent_id (illustrated after this example)
        if prev_sent_id is not None and sent_ids[0] == prev_sent_id:
            sent_ids = np.logical_not(sent_ids)

        # append to temporary list
        input_data_list.append(input_data)
        sent_ids_list.append(sent_ids)

        # update `prev_sent_id`
        prev_sent_id = sent_ids[-1]

    input_data = np.concatenate(input_data_list)
    sent_ids = np.concatenate(sent_ids_list)

    file_name, cur_num_batch = create_tfrecords(
        save_dir=tfrecord_dir,
        basename="{}-{}-{}".format(FLAGS.split, idx, FLAGS.pass_id),
        data=[input_data, sent_ids],
        bsz_per_host=FLAGS.bsz_per_host,
        seq_len=FLAGS.seq_len,
        bi_data=FLAGS.bi_data,
        sp=sp,
    )

    filenames.append(file_name)
    num_batch += cur_num_batch

    record_info = {"filenames": filenames, "num_batch": num_batch}

    return record_info
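
A short illustration of the sent_ids flip applied when concatenating the shuffled shards above; the boolean arrays are hypothetical:

import numpy as np

prev_shard_sent_ids = np.array([True, True, False])   # previous shard ends with False
next_shard_sent_ids = np.array([False, False, True])  # next shard also starts with False

# If the boundary values collide, negate the whole next shard so the
# True/False alternation that marks sentence boundaries is preserved.
if next_shard_sent_ids[0] == prev_shard_sent_ids[-1]:
    next_shard_sent_ids = np.logical_not(next_shard_sent_ids)

print(next_shard_sent_ids.tolist())  # [True, True, False]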
Example #13
import os, traceback, logging, time, re, sys
from pyspark import SparkContext, SparkConf
from seg_utils import Tokenizer

static_jd_title = "titledesc"  # titledesc, title, desc

token = Tokenizer()


def parse_line_jdtitle(line):
    title = []
    try:
        seg_line = line.strip().split('\t')
        if seg_line[0].isdigit() and len(seg_line) >= 4:
            title.append("_".join([seg_line[0], seg_line[3]]))
    except Exception as e:
        logging.warning('parse_line_jdtitle_err=%s,line:%s' % (repr(e), line))
    return title


def parse_line_jd(line):
    desc = []
    try:
        seg_line = line.strip().split('\t')
        if seg_line[0].isdigit() and len(seg_line) >= 34:
            important_tokens = token.select_important_tokens(
                seg_line[33].replace('\\n', ""))
            desc = ["_".join([seg_line[0], e]) for e in important_tokens]
    except Exception as e:
        logging.warning('parse_line_jd_err=%s,line:%s' % (repr(e), line))
    return desc
Example #14
 def __init__(self):
     self.tokenizer = Tokenizer()
Example #15
class TrainData():
    def __init__(self):
        self.tokenizer = Tokenizer()

    def original2corp(self):
        text = []
        print("extract corpu from original file: %s --> corpus file: %s" %
              (FLAGS.original_file, FLAGS.corpus_file))
        for line in open(FLAGS.original_file, encoding="utf8").readlines():
            try:
                e = line.strip().split("\t")[33].replace("\\n", "").lower()
            except:
                continue
            text.append(e)
        with open(FLAGS.corpus_file, "w", encoding="utf8") as fout:
            fout.write("\n".join(text))

    def gen_train_samples(self):
        self.original2corp()
        sample_set = {}
        np.random.seed(8)
        # load the data, treating each text as one unit
        important_tokens = []
        text = open(FLAGS.corpus_file, encoding="utf8").readlines()[:10]
        print("select important tokens...")
        for e in tqdm(text, total=len(text)):
            tmp = self.tokenizer.select_important_tokens(clean_line(e.strip()))
            if len(tmp) < 10: continue
            important_tokens.append(tmp)
        # sample positive and negative pairs: words from the same text are positives, words from other texts are negatives (see the sketch after this example)
        print("sample(1+k negative) train and valid set...")
        num_neg = min(len(important_tokens) - 1, MAX_NUM_NEG)
        for cur_index, cur_ele in tqdm(enumerate(important_tokens),
                                       total=len(important_tokens)):
            np.random.shuffle(cur_ele)
            cut_index = int(len(cur_ele) / 3)
            lhs, rhs = cur_ele[:cut_index], cur_ele[cut_index:]
            for word_index, word in enumerate(lhs):
                if word in sample_set: continue
                positive_entity = rhs[word_index]  # positive sample
                # negative sampling
                negative_entitys, negs = [], []
                negative_indexes = [
                    i for i in range(len(important_tokens)) if i != cur_index
                ]
                random.shuffle(negative_indexes)
                for e in negative_indexes:
                    if (len(negs) >= num_neg): break
                    if word in important_tokens[
                            e] or positive_entity in important_tokens[e]:
                        continue
                    negs.append(e)
                for neg_index in negs:
                    while True:
                        neg_tmp = random.sample(important_tokens[neg_index],
                                                1)[0]
                        if neg_tmp != word and neg_tmp not in negative_entitys:
                            break
                    negative_entitys.append(neg_tmp)
                assert len(negative_entitys) == num_neg
                # pad if too few negatives were sampled
                #if len(negative_entitys) < num_neg:
                #    negative_entitys += ["PAD"] * (num_neg - len(negative_entitys))
                sample_set[word] = [positive_entity, negative_entitys]
        # build the vocabulary
        token_freq = defaultdict(int)
        token_freq["UNKNOWN"] = 1e8
        #token_freq["PAD"] = 1e8-1
        for k, (p, n) in sample_set.items():
            tmp = [k, p] + n
            for t in tmp:
                if re_en.fullmatch(t): token_freq[t] += 1
                else:
                    for e in list(t):
                        token_freq[e] += 1
        sorted_token_freq = sorted(token_freq.items(),
                                   key=lambda d: d[1],
                                   reverse=True)[:VOCAB_SIZE]
        word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
        if conf.over_write_vocab:
            print("generate word2id file: %s" % (conf.vocab))
            json.dump(word2id,
                      open(conf.vocab, "w", encoding="utf8"),
                      ensure_ascii=False,
                      indent=2)
        _keys_ = list(sample_set.keys())
        train_set = {
            k: sample_set[k]
            for k in _keys_[:int(len(_keys_) * conf.train_valid_ratio)]
        }
        valid_set = {
            k: sample_set[k]
            for k in _keys_[int(len(_keys_) * conf.train_valid_ratio):]
        }
        print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d" %
              (len(sample_set), len(train_set), len(valid_set)))
        print("generate train sample file :%s\tvalid sample file: %s" %
              (conf.train_samples, conf.valid_samples))
        json.dump(train_set,
                  open(conf.train_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)
        json.dump(valid_set,
                  open(conf.valid_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)

    def gen_vocab(self, title2entitys):
        token_freq = defaultdict(int)
        token_freq["UNKNOWN"] = 1e8
        for title, entitys in title2entitys.items():
            line = [title] + entitys
            for t in line:
                if re_en.fullmatch(t): token_freq[t] += 1
                else:
                    for e in list(t):
                        token_freq[e] += 1
        sorted_token_freq = sorted(token_freq.items(),
                                   key=lambda d: d[1],
                                   reverse=True)[:VOCAB_SIZE]
        word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
        print("generate word2id file: %s" % (conf.vocab))
        json.dump(word2id,
                  open(conf.vocab, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)

    def gen_train_sample_based_title_desc(self):
        entity_dicts = {
            line.strip(): 1
            for line in open(conf.new_entity_file,
                             encoding="utf8").readlines()
        }
        valid_titles = {
            line.strip(): 1
            for line in open("data/valid_titles", encoding="utf8").readlines()
        }
        title_entitys, entity_title, sample_set = {}, {}, []
        matchObj = re.compile(r'(.+)&([0-9]+)', re.M | re.I)
        title2entitys = {line.strip().lower().split('\t')[0]: line.strip().lower().split('\t')[1:] \
          for line in open("data/cv_title2entitys_corpu", encoding="utf8").readlines()}
        title_entitys = {
            k: v
            for k, v in title2entitys.items() if len(v) >= 10 and len(v) < 20
        }
        if conf.over_write_vocab: self.gen_vocab(title_entitys)
        _keys_ = list(title_entitys.keys())
        print("sample(1+k negative) train and valid set...")
        num_neg = min(len(title_entitys) - 1, MAX_NUM_NEG)
        # sampling
        for title, entitys in tqdm(title_entitys.items(),
                                   total=len(title_entitys)):
            positive_entitys = random.sample(entitys, min(len(entitys), 10))  # positive samples
            negative_titles_candidate = [e for e in _keys_ if e != title]
            for pos_entity in positive_entitys:  # sample negatives for each positive entity
                negative_entitys = []
                negs = random.sample(negative_titles_candidate, num_neg)
                for neg_tit in negs:
                    try:
                        negative_entitys.append(
                            random.sample(title_entitys[neg_tit], 1)[0])
                    except:
                        pass  # skip titles whose entity list could not be sampled
                if len(negative_entitys) < num_neg:
                    negative_entitys += [negative_entitys[0]
                                         ] * (num_neg - len(negative_entitys))
                assert len(negative_entitys) == num_neg
                sample_set.append([title, pos_entity, list(negative_entitys)])
        #exit()
        train_set = {
            i: ele
            for i, ele in enumerate(
                sample_set[:int(len(sample_set) * conf.train_valid_ratio)])
        }
        valid_set = {
            i: ele
            for i, ele in enumerate(
                sample_set[int(len(sample_set) * conf.train_valid_ratio):])
        }
        print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d" %
              (len(sample_set), len(train_set), len(valid_set)))
        print("generate train sample file :%s\tvalid sample file: %s" %
              (FLAGS.train_samples, FLAGS.valid_samples))
        json.dump(train_set,
                  open(FLAGS.train_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)
        json.dump(valid_set,
                  open(FLAGS.valid_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)
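
A compact illustration of the sample layout produced by gen_train_samples above (hypothetical tokens; each entry pairs one positive from the same text with num_neg negatives drawn from other texts):

num_neg = 3  # stands in for min(len(important_tokens) - 1, MAX_NUM_NEG)

# word -> [positive entity from the same text, [negatives from other texts]]
sample_set = {
    "java": ["spring", ["销售", "护士", "会计"]],
    "hadoop": ["spark", ["前台", "司机", "编辑"]],
}

for word, (positive, negatives) in sample_set.items():
    assert len(negatives) == num_neg
    print(word, positive, negatives)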