Example #1
class queryDocSim:
    def __init__(self, model_type='rnn', ckpt_num=14000):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.tokenizer = Tokenizer()
        self.encoder = Encoder(model_type)
        self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(
            ckpt_num)
        # Model inputs (placeholders)
        self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN],
                                   name='a')  # [batch_size, SEQ_LEN]
        self.b_in = tf.placeholder(
            tf.int32, [None, None, SEQ_LEN],
            name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
        self.is_training = tf.placeholder_with_default(False, shape=())
        # Create the TF session
        self.session = tf.Session()
        self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
            self.a_in, self.b_in, self.is_training)  # semantic encoding
        tf.train.Saver().restore(self.session,
                                 self.estimator_save_name)  # restore the trained weights

    def run_step(self, entity, entity_list):
        x_batch = np.array([seq2ids(entity)])
        y_batch = np.array([[seq2ids(e) for e in entity_list]])
        fetch = self.session.run(
            {'emb_a': self.encoder.emb_a, 'emb_b': self.encoder.emb_b, 'sim_ab': self.encoder.sim_ab},
            feed_dict={self.a_in: x_batch, self.b_in: y_batch, self.is_training: False})
        res = {
            entity + "-" + e: fetch['sim_ab'][0][i]
            for i, e in enumerate(entity_list)
        }
        sorted_res = sorted(res.items(), key=lambda d: d[1], reverse=True)
        return sorted_res

    def sim(self, query, doc, topk=5):
        sim_sum = 0.0
        # score the query against the document's most informative tokens
        tokens = self.tokenizer.select_important_tokens(doc)
        sim_res = self.run_step(query, tokens)
        prob_res = [(k, sigmoid(v)) for k, v in sim_res]  # map raw scores to (0, 1)
        for k, v in prob_res[:topk]:  # average the top-k probabilities
            sim_sum += v
        sim_avg = round(sim_sum / topk, 3)
        return sim_avg, prob_res

    def cal_sim(self, req_dict):
        similarity, prob_res = 0.0, []
        try:
            query = req_dict['request']['p']['query']
            doc = req_dict['request']['p']['doc']
            similarity, prob_res = self.sim(query, doc)
        except Exception as e:
            logging.warning("run_error: %s" % traceback.format_exc())
        return similarity, prob_res
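A minimal usage sketch for the class above (hedged: the request layout simply mirrors the keys read in cal_sim, and the checkpoint under FLAGS.model_dir is assumed to exist):

# Hypothetical usage; FLAGS.model_dir must already contain the trained checkpoint.
scorer = queryDocSim(model_type='rnn', ckpt_num=14000)
req = {"request": {"p": {"query": "query text", "doc": "document text to score against"}}}
similarity, prob_res = scorer.cal_sim(req)
print(similarity)    # averaged probability over the top-k most similar doc tokens
print(prob_res[:3])  # [("query-token", probability), ...] sorted by score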
Example #2
def get_corpus(file_path="position_name_desc_re"):
    title_entitys = {}
    token = Tokenizer()
    for file_name in os.listdir(file_path):  # iterate over the files in the directory
        text = [
            line.strip().lower().replace("\\n", "").split('\t')
            for line in open(file_path + "/" + file_name, encoding="utf8").readlines()
        ]
        for line in tqdm(text, total=len(text)):
            if len(line) <= 1:
                continue
            # keep only the informative tokens from the description columns
            important_tokens = token.select_important_tokens("".join(line[1:]))
            if line[0] not in title_entitys:
                title_entitys[line[0]] = []
            title_entitys[line[0]].extend(important_tokens)
    return title_entitys
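A hedged usage sketch (the directory name and the tab-separated "title, description..." layout come from the code above, and it assumes get_corpus returns the title_entitys mapping as written there):

# Hypothetical call; file_path is a directory of tab-separated files whose
# first column is a job title and the remaining columns its description.
title_entitys = get_corpus("position_name_desc_re")
for title, tokens in list(title_entitys.items())[:3]:
    print(title, tokens[:5])  # a title and a few of its important tokens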
Example #3
class entitySimilar:
    def __init__(self, model_type='rnn', ckpt_num=0):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.tokenizer = Tokenizer()
        self.encoder = Encoder(model_type)
        self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(
            ckpt_num)
        # Model inputs (placeholders)
        self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN],
                                   name='a')  # [batch_size, SEQ_LEN]
        self.b_in = tf.placeholder(
            tf.int32, [None, None, SEQ_LEN],
            name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
        self.is_training = tf.placeholder_with_default(False, shape=())
        # Create the TF session
        self.session = tf.Session()
        self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
            self.a_in, self.b_in, self.is_training)  # semantic encoding
        tf.train.Saver().restore(self.session, self.estimator_save_name)

    def run_step(self, entity, entity_list):
        x_batch = np.array([seq2ids(entity)])
        y_batch = np.array([[seq2ids(e) for e in entity_list]])
        fetch = self.session.run(
            {'emb_a': self.encoder.emb_a, 'emb_b': self.encoder.emb_b, 'sim_ab': self.encoder.sim_ab},
            feed_dict={self.a_in: x_batch, self.b_in: y_batch, self.is_training: False})
        res = {
            entity + "-" + e: fetch['sim_ab'][0][i]
            for i, e in enumerate(entity_list)
        }
        sorted_res = sorted(res.items(), key=lambda d: d[1], reverse=True)
        return sorted_res

    def analyze(self, word, text):
        tokens = self.tokenizer.select_important_tokens(text)
        sim_res = self.run_step(word, tokens)
        prob_res = [(k, sigmoid(v)) for k, v in sim_res]  # map raw scores to (0, 1)
        return prob_res
Example #4
class TrainData:
    def __init__(self):
        self.tokenizer = Tokenizer()

    def original2corp(self):
        text = []
        print("extract corpus from original file: %s --> corpus file: %s" %
              (FLAGS.original_file, FLAGS.corpus_file))
        for line in open(FLAGS.original_file, encoding="utf8").readlines():
            try:
                # the 34th tab-separated field holds the description text
                e = line.strip().split("\t")[33].replace("\\n", "").lower()
            except IndexError:
                continue
            text.append(e)
        with open(FLAGS.corpus_file, "w", encoding="utf8") as fout:
            fout.write("\n".join(text))

    def gen_train_samples(self):
        self.original2corp()
        sample_set = {}
        np.random.seed(8)
        # load the corpus, one document per line
        important_tokens = []
        text = open(FLAGS.corpus_file, encoding="utf8").readlines()[:10]
        print("select important tokens...")
        for e in tqdm(text, total=len(text)):
            tmp = self.tokenizer.select_important_tokens(clean_line(e.strip()))
            if len(tmp) < 10: continue
            important_tokens.append(tmp)
        # sample positives and negatives: tokens from the same document are
        # positives, tokens from other documents are negatives
        print("sample(1+k negative) train and valid set...")
        num_neg = min(len(important_tokens) - 1, MAX_NUM_NEG)
        for cur_index, cur_ele in tqdm(enumerate(important_tokens),
                                       total=len(important_tokens)):
            np.random.shuffle(cur_ele)
            cut_index = int(len(cur_ele) / 3)
            lhs, rhs = cur_ele[:cut_index], cur_ele[cut_index:]
            for word_index, word in enumerate(lhs):
                if word in sample_set: continue
                positive_entity = rhs[word_index]  # positive sample
                # negative sampling
                negative_entitys, negs = [], []
                negative_indexes = [
                    i for i in range(len(important_tokens)) if i != cur_index
                ]
                random.shuffle(negative_indexes)
                for e in negative_indexes:
                    if len(negs) >= num_neg:
                        break
                    if word in important_tokens[
                            e] or positive_entity in important_tokens[e]:
                        continue
                    negs.append(e)
                for neg_index in negs:
                    while True:
                        neg_tmp = random.sample(important_tokens[neg_index],
                                                1)[0]
                        if neg_tmp != word and neg_tmp not in negative_entitys:
                            break
                    negative_entitys.append(neg_tmp)
                assert len(negative_entitys) == num_neg
                # pad when too few negatives were sampled
                #if len(negative_entitys) < num_neg:
                #    negative_entitys += ["PAD"] * (num_neg - len(negative_entitys))
                sample_set[word] = [positive_entity, negative_entitys]
        # build the vocabulary
        token_freq = defaultdict(int)
        token_freq["UNKNOWN"] = 1e8  # a huge count keeps UNKNOWN at the top of the vocab
        #token_freq["PAD"] = 1e8-1
        for k, (p, n) in sample_set.items():
            tmp = [k, p] + n
            for t in tmp:
                if re_en.fullmatch(t):  # pure-English token: count it as a whole word
                    token_freq[t] += 1
                else:  # otherwise count character by character
                    for e in list(t):
                        token_freq[e] += 1
        sorted_token_freq = sorted(token_freq.items(),
                                   key=lambda d: d[1],
                                   reverse=True)[:VOCAB_SIZE]
        word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
        if conf.over_write_vocab:
            print("generate word2id file: %s" % (conf.vocab))
            json.dump(word2id,
                      open(conf.vocab, "w", encoding="utf8"),
                      ensure_ascii=False,
                      indent=2)
        _keys_ = list(sample_set.keys())
        train_set = {
            k: sample_set[k]
            for k in _keys_[:int(len(_keys_) * conf.train_valid_ratio)]
        }
        valid_set = {
            k: sample_set[k]
            for k in _keys_[int(len(_keys_) * conf.train_valid_ratio):]
        }
        print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d" %
              (len(sample_set), len(train_set), len(valid_set)))
        print("generate train sample file :%s\tvalid sample file: %s" %
              (conf.train_samples, conf.valid_samples))
        json.dump(train_set,
                  open(conf.train_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)
        json.dump(valid_set,
                  open(conf.valid_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)

    def gen_vocab(self, title2entitys):
        token_freq = defaultdict(int)
        token_freq["UNKNOWN"] = 1e8  # a huge count keeps UNKNOWN at the top of the vocab
        for title, entitys in title2entitys.items():
            line = [title] + entitys
            for t in line:
                if re_en.fullmatch(t):  # pure-English token: count it as a whole word
                    token_freq[t] += 1
                else:  # otherwise count character by character
                    for e in list(t):
                        token_freq[e] += 1
        sorted_token_freq = sorted(token_freq.items(),
                                   key=lambda d: d[1],
                                   reverse=True)[:VOCAB_SIZE]
        word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
        print("generate word2id file: %s" % (conf.vocab))
        json.dump(word2id,
                  open(conf.vocab, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)

    def gen_train_sample_based_title_desc(self):
        entity_dicts = {
            line.strip(): 1
            for line in open(conf.new_entity_file,
                             encoding="utf8").readlines()
        }
        valid_titles = {
            line.strip(): 1
            for line in open("data/valid_titles", encoding="utf8").readlines()
        }
        title_entitys, entity_title, sample_set = {}, {}, []
        matchObj = re.compile(r'(.+)&([0-9]+)', re.M | re.I)
        title2entitys = {
            line.strip().lower().split('\t')[0]: line.strip().lower().split('\t')[1:]
            for line in open("data/cv_title2entitys_corpu", encoding="utf8").readlines()
        }
        title_entitys = {
            k: v
            for k, v in title2entitys.items() if len(v) >= 10 and len(v) < 20
        }
        if conf.over_write_vocab: self.gen_vocab(title_entitys)
        _keys_ = list(title_entitys.keys())
        print("sample(1+k negative) train and valid set...")
        num_neg = min(len(title_entitys) - 1, MAX_NUM_NEG)
        # sample positives and negatives per title
        for title, entitys in tqdm(title_entitys.items(),
                                   total=len(title_entitys)):
            positive_entitys = random.sample(entitys, min(len(entitys), 10))  # positive samples
            negative_titles_candidate = [e for e in _keys_ if e != title]
            for pos_entity in positive_entitys:  # sample negatives for each positive
                negative_entitys = []
                negs = random.sample(negative_titles_candidate, num_neg)
                for neg_tit in negs:
                    try:
                        negative_entitys.append(
                            random.sample(title_entitys[neg_tit], 1)[0])
                    except (KeyError, ValueError):
                        continue  # skip titles with no usable entity
                if len(negative_entitys) < num_neg:
                    # pad by repeating the first negative so every sample has num_neg negatives
                    negative_entitys += [negative_entitys[0]] * (num_neg - len(negative_entitys))
                assert len(negative_entitys) == num_neg
                sample_set.append([title, pos_entity, list(negative_entitys)])
        #exit()
        train_set = {
            i: ele
            for i, ele in enumerate(
                sample_set[:int(len(sample_set) * conf.train_valid_ratio)])
        }
        valid_set = {
            i: ele
            for i, ele in enumerate(
                sample_set[int(len(sample_set) * conf.train_valid_ratio):])
        }
        print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d" %
              (len(sample_set), len(train_set), len(valid_set)))
        print("generate train sample file :%s\tvalid sample file: %s" %
              (FLAGS.train_samples, FLAGS.valid_samples))
        json.dump(train_set,
                  open(FLAGS.train_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)
        json.dump(valid_set,
                  open(FLAGS.valid_samples, "w", encoding="utf8"),
                  ensure_ascii=False,
                  indent=2)
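For reference, a hedged sketch of what one entry of the dumped JSON looks like, based on how sample_set is built above (the titles and tokens are made-up placeholders):

# In gen_train_sample_based_title_desc each entry is keyed by a running index and
# holds [title, positive_entity, [negative_1, ..., negative_k]];
# in gen_train_samples the key is the anchor word and the value is
# [positive_entity, [negative_1, ..., negative_k]].
example_train_set = {
    "0": [
        "algorithm engineer",              # anchor job title
        "machine learning",                # positive: entity seen under that title
        ["cashier", "welding", "retail"]   # negatives: entities sampled from other titles
    ]
}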