def _prepare_q_a(q_lines: list, a_lines: list, qa_dict: dict):
    """Fill qa_dict with entity/segmentation metadata for each Q-A pair.

    :param q_lines: question lines (one per answer line)
    :param a_lines: answer lines, aligned with q_lines
    :param qa_dict: dict mutated in place; keyed by the stripped question
    """
    for question, answer in zip(q_lines, a_lines):
        question = question.strip()
        answer = answer.strip()
        # Word-level segmentation with POS tags; tokens tagged "kc"
        # (course) or "shisu" are treated as named entities, e.g.
        # [('产品经理', 'kc'), ('的', 'uj'), ('课程', 'n'), ('有', 'v'), ('什么', 'r'), ('特点', 'n'), ('?', 'x')]
        tagged = cut(question, by_character=False, with_pos=True)
        qa_dict[question] = {
            "entity": [word for word, pos in tagged if pos in ("kc", "shisu")],
            "q_cut_by_word": [word for word, _ in tagged],
            "q_cut_by_char": cut(question, by_character=True),
            "answer": answer,
        }
def predict():
    """Read a sentence from stdin, run greedy seq2seq decoding, print the reply."""
    s2s.eval()
    raw = input("请输入句子: ")
    # sentence -> token list -> padded index sequence
    tokens = cut(raw, by_character=by_char)
    max_len = config.seq_len
    encoded = config.s2s_input.transform(tokens, max_len)
    # Build feature and feature_length tensors (batch of size 1).
    feature = torch.LongTensor(encoded).to(config.device).unsqueeze(0)
    feature_length = torch.LongTensor([min(len(tokens), max_len)]).to(config.device)
    # Decode, then move batch dim first before taking the argmax.
    decoded = s2s.evaluate(feature, feature_length).permute(1, 0, 2)
    indices = decoded.argmax(dim=-1).squeeze().detach().numpy().tolist()
    # Indices back to tokens; everything after the EOS marker is dropped.
    reply = "".join(config.s2s_target.inverse_transform(indices)).split("EOS")[0]
    print("预测结果为:", reply)
def process_xiaohuangji(f_train, f_test):
    """Process the Xiaohuangji chat corpus into fastText train/test samples.

    Only the first "M" line of each pair (the question side) is kept;
    sentences containing the filter keywords are skipped.

    :param f_train: open file to append training samples to
    :param f_test: open file to append test samples to
    :return: (num_train, num_test) sample counts
    """
    num_train = 0
    num_test = 0
    # BUGFIX: close the corpus file via a context manager (handle leaked).
    with open(xiaohuangji_path, encoding='UTF-8') as fin:
        ret = fin.readlines()
    flag = 0  # 0 = next M line is the first of its pair
    # TODO: sentences of length 1 could be dropped (see sibling processor)
    for line in tqdm(ret, desc='小黄鸡'):
        if line.startswith("E"):
            flag = 0
            continue
        elif line.startswith("M"):
            if flag == 0:
                line = line[1:].strip()  # drop the leading "M"
                flag = 1
            else:
                continue  # skip the answer side of the pair
        line_cuted = cut(line)
        # BUGFIX: was `keywoads_in_line((line_cuted))` — typo for the
        # `keywords_in_line` helper used by the sibling processors, plus
        # redundant double parentheses.
        if not keywords_in_line(line_cuted):
            line_cuted = " ".join(line_cuted) + "\t" + "__label__chat"
            # Randomly assign the sample to train or test.
            if random.choice(flags) == 0:
                num_train += 1
                f_train.write(line_cuted + "\n")
            else:
                num_test += 1
                f_test.write(line_cuted + "\n")
    return num_train, num_test
def cut_file(file_path, out_path):
    """Segment every line of *file_path* and write the result to *out_path*.

    :param file_path: path of the text file to segment
    :param out_path: path the segmented text is written to
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()
    pieces = []
    for line in lines:
        # BUGFIX: cut() returns a list of tokens (every sibling does
        # `" ".join(cut(...))`), so `out += cut(line)` raised TypeError
        # (str += list). Join tokens with spaces, one input line per
        # output line. NOTE(review): separator choice assumed — confirm
        # against the consumer of out_path.
        pieces.append(" ".join(cut(line)))
    with open(out_path, 'w') as f:
        f.write("\n".join(pieces))
def process_byhand_data(file):
    """Process the hand-written Q&A corpus into __label__QA samples.

    :param file: open output file the samples are appended to
    :return: number of samples written
    """
    num = 0
    # BUGFIX: close the corpus file via a context manager (handle leaked).
    with open(byhand_path, "r", encoding="utf-8") as fin:
        lines = fin.readlines()
    for line in tqdm(lines, desc="问答对"):
        line = line.strip()
        line_cut = cut(line)
        # Space-joined tokens plus the fastText label.
        line_cut = " ".join(line_cut) + "\t" + "__label__QA"
        num += 1
        file.write(line_cut + "\n")
    return num
def predict(self, sentence):
    """Beam-search a reply for *sentence*; return one candidate at random.

    :param sentence: user input sentence
    :return: the decoded reply as a single string
    """
    self.s2s.eval()
    # Encode the sentence into a fixed-length index sequence.
    tokens = cut(sentence, by_character=by_char)
    encoded = config.s2s_input.transform(tokens, config.seq_len)
    # Build feature and feature_length tensors (batch of size 1).
    feature = torch.LongTensor(encoded).to(config.device).unsqueeze(0)
    feature_length = torch.LongTensor(
        [min(len(tokens), config.seq_len)]).to(config.device)
    # Beam-search decode, then pick any one of the candidates.
    candidates = self.s2s.evaluate_beam_search(feature, feature_length)
    chosen = random.choice(candidates)
    return "".join(config.s2s_target.inverse_transform(chosen))
def predict(self, sentence, recall_list):
    """Rank recalled questions against *sentence* and return the best answer.

    :param sentence: the user question
    :param recall_list: candidate questions from the recall stage
    :return: the matched answer, or a fallback reply below the threshold
    """
    # Pair the user question with every candidate, char-segmented, e.g.
    # [['python', '好', '学', '吗'], ...] vs [['python', '难', '吗'], ...]
    query_seqs = []
    candidate_seqs = []
    for candidate in recall_list:
        query_seqs.append(
            self.ws.transform(cut(sentence, by_character=True), config.seq_len))
        candidate_seqs.append(
            self.ws.transform(cut(candidate, by_character=True), config.seq_len))
    q1 = torch.LongTensor(query_seqs)
    q2 = torch.LongTensor(candidate_seqs)
    # Model output is [batch_size, 2]; the last column is the match probability.
    scores = self.model(q1, q2)
    best_value, best_index = torch.topk(scores[:, -1], k=1, dim=0)
    # Only answer when the best match clears the configured threshold.
    if best_value.item() > config.sort_threshold:
        return self.qa_dict[recall_list[best_index.item()]]["answer"]
    else:
        return "这个问题我也还没学到啊!"
def predict(self, sentence: str):
    """Return the recalled questions most similar to *sentence*.

    Candidates sharing a named entity with the question are preferred;
    otherwise all recalled questions are returned.

    :param sentence: the question to search for
    :return: list of candidate questions
    """
    # Space-joined segmentation, e.g. ['python 真的 很 简单 吗 ?']
    sentence_cut = [" ".join(cut(sentence, by_character=by_char))]
    search_vector = self.vectorizer.transform(sentence_cut)
    # e.g. [[('0.0', '蒋夏梦是谁?'), ('1.0', 'python真的很简单吗?'), ...]]
    search_results = self.search_index.search(
        search_vector,
        k=config.recall_nums,
        k_clusters=config.recall_clusters,
        num_indexes=2,
        return_distance=True)
    final_result = list()
    # Entities in the user question: tokens POS-tagged "kc" or "shisu".
    sentence_cut_with_pos = cut(sentence, by_character=False, with_pos=True)
    q_entity = [
        word for word, pos in sentence_cut_with_pos
        if pos == "kc" or pos == "shisu"
    ]
    # Keep candidates that share at least one entity with the question.
    for group in search_results:
        for item in group:
            matched_q = item[1]
            matched_q_entity = self.s2v.qa_dict[matched_q]["entity"]
            if len(set(matched_q_entity) & set(q_entity)) > 0:  # intersection
                final_result.append(matched_q)
    if len(final_result) > 0:
        # Entity-matching candidates exist: return only those.
        return final_result
    # BUGFIX: the original `[j[1] for j in i for i in search_results]` had
    # its for-clauses reversed, iterating the leaked loop variable `i` and
    # repeating entries instead of flattening the result groups.
    return [item[1] for group in search_results for item in group]
def process_crawled_corpus(fout_train, fout_test, by_char):
    """Process the crawled data into fastText __label__QA samples.

    :param fout_train: open file for training samples
    :param fout_test: open file for test samples
    :param by_char: segment by character instead of by word
    :return: number of samples written
    """
    num = 0
    # BUGFIX: explicit utf-8 (the corpus is Chinese; sibling processors
    # pass an encoding) and close the handle via a context manager.
    with open(config.by_crawl_path, "r", encoding="utf-8") as fin:
        lines = fin.readlines()
    for line in tqdm(lines, desc="Processing Crawled Corpus"):
        q = " ".join(cut(line, by_character=by_char)).strip()  # segment
        q += "\t__label__QA"
        # randint is inclusive: roughly 1/(N+2) of samples go to test.
        if random.randint(0, N + 1) == 0:
            fout_test.write(q + "\n")
        else:
            fout_train.write(q + "\n")
        num += 1
    return num
def extract_and_cut_question(by_char=True):
    """Segment every question in the hand-made JSON corpus into the recall file.

    :param by_char: segment by character instead of by word
    """
    num = 0
    raw = open(config.by_hand_path, "r").read()  # read the JSON file
    corpus = json.loads(raw)
    out_path = (config.recall_corpus_by_char
                if by_char else config.recall_corpus_by_word)
    with open(out_path, "w") as fout:
        # Each value is a list of lists of questions.
        for nested in tqdm(corpus.values(), desc="Processing Homemade Corpus"):
            for group in nested:
                for question in group:
                    segmented = " ".join(
                        cut(question, by_character=by_char)).strip()
                    fout.write(segmented + "\n")
                    num += 1
    print(num)
def process_crawled_data(f_train, f_test):
    """Process the crawled data into fastText __label__QA samples.

    :param f_train: open file to append training samples to
    :param f_test: open file to append test samples to
    :return: (num_train, num_test) sample counts
    """
    num_train = 0
    num_test = 0
    # BUGFIX: close the corpus file via a context manager (handle leaked).
    with open(crawled_path, encoding='UTF-8') as fin:
        lines = fin.readlines()
    for line in tqdm(lines, desc='crawled'):
        line_cuted = cut(line)
        # Space-joined tokens, embedded newlines removed, plus the label.
        line_cuted = " ".join(line_cuted).replace("\n", "") + "\t" + "__label__QA"
        # Randomly assign the sample to train or test.
        if random.choice(flags) == 0:
            num_train += 1
            f_train.write(line_cuted + "\n")
        else:
            num_test += 1
            f_test.write(line_cuted + "\n")
    return num_train, num_test
def predict_beam_search():
    """Read a sentence from stdin and print every beam-search candidate reply."""
    s2s.eval()
    raw = input("请输入句子: ")
    # sentence -> token list -> padded index sequence
    tokens = cut(raw, by_character=by_char)
    encoded = config.s2s_input.transform(tokens, config.seq_len)
    # Build feature and feature_length tensors (batch of size 1).
    feature = torch.LongTensor(encoded).to(config.device).unsqueeze(0)
    feature_length = torch.LongTensor(
        [min(len(tokens), config.seq_len)]).to(config.device)
    # Decode with beam search and print each candidate as a sentence.
    for candidate in s2s.evaluate_beam_search(feature, feature_length):
        print("".join(config.s2s_target.inverse_transform(candidate)))
def process_by_hand(fout_train, fout_test, by_char):
    """Process the hand-made sentences (JSON) into fastText __label__QA samples.

    :param fout_train: open file for training samples
    :param fout_test: open file for test samples
    :param by_char: segment by character instead of by word
    :return: number of samples written
    """
    num = 0
    raw = open(config.by_hand_path, "r").read()  # read the JSON file
    corpus = json.loads(raw)  # parse straight with json
    # Each value is a list of lists of questions.
    for nested in tqdm(corpus.values(), desc="Processing Homemade Corpus"):
        for group in nested:
            for question in group:
                if "校区" in question:  # drop campus-related questions
                    continue
                sample = " ".join(cut(question, by_character=by_char)).strip()
                sample += "\t__label__QA"
                # Random train/test assignment (randint is inclusive).
                if random.randint(0, N + 1) == 0:
                    fout_test.write(sample + "\n")
                else:
                    fout_train.write(sample + "\n")
                num += 1
    return num
def process_byhand_data(f_train, f_test):
    """Process the hand-built data into fastText __label__QA samples.

    :param f_train: open file to append training samples to
    :param f_test: open file to append test samples to
    :return: (num_train, num_test) sample counts
    """
    num_train = 0
    num_test = 0
    total_lines = json.loads(open(byhand_path, encoding='UTF-8').read())
    for groups in total_lines.values():
        for lines in tqdm(groups, desc='byhand'):
            for line in lines:
                # Drop the few unwanted questions.
                if "校区" in line:
                    continue
                sample = " ".join(cut(line)) + "\t" + "__label__QA"
                # Randomly assign the sample to train or test.
                if random.choice(flags) == 0:
                    num_train += 1
                    f_train.write(sample + "\n")
                else:
                    num_test += 1
                    f_test.write(sample + "\n")
    return num_train, num_test
def process_xiaohuangji(fout_train, fout_test, by_char):
    """Process the Xiaohuangji corpus into fastText __label__chat samples.

    Only the first "M" line of each pair (the question side) is kept.

    :param fout_train: open file for training samples
    :param fout_test: open file for test samples
    :param by_char: segment by character instead of by word
    :return: number of samples written
    """
    num = 0
    fin = open(config.xiaohuangji_path, "r").readlines()
    first_m_flag = True  # whether the next M line is the first of its pair
    for line in tqdm(fin, desc="Processing Xiaohuangji Corpus"):
        if line.startswith("M"):  # sentence lines start with M
            # Only the first M of the pair, and only if it contains none
            # of the filter keywords.
            if first_m_flag and not keywords_in_line(line):
                text = line[2:].strip()  # drop the leading "M "
                if len(text) > 1:  # skip single-character sentences
                    sample = " ".join(
                        cut(text, by_character=by_char)).strip()
                    sample += "\t__label__chat"  # attach the class label
                    # Split samples between train and test at roughly 4:1.
                    if random.randint(0, N + 1) == 0:
                        fout_test.write(sample + "\n")  # with newline
                    else:
                        fout_train.write(sample + "\n")
                    num += 1
            first_m_flag = not first_m_flag
    return num
def process_xiaohuangji(file):
    """Process the Xiaohuangji corpus into __label__chat samples.

    Keeps the question (first "M" line) of each pair, skipping lines that
    contain the filter keywords and single-character sentences.

    :param file: open output file the samples are appended to
    :return: number of samples written
    """
    num = 0
    # BUGFIX: `flag` was only assigned inside the loop branches, so a
    # corpus whose first line starts with "M" raised NameError; initialise
    # it before the loop (as the sibling processor does).
    flag = 0
    # BUGFIX: close the corpus file via a context manager (handle leaked).
    with open(xiaohuangji_path, "r", encoding="utf-8") as fin:
        lines = fin.readlines()
    for line in tqdm(lines, desc="小黄鸡"):
        if line.startswith("E"):
            flag = 0
            continue
        elif line.startswith("M"):
            if flag == 0:
                line = line[1:].strip()  # drop the leading "M"
                if len(line) == 1:  # drop single-character sentences
                    continue
                flag = 1
            else:
                continue  # skip the answer side of the pair
        line_cut = cut(line)
        if not keywords_in_line(line_cut):
            line_cut = " ".join(line_cut) + "\t" + "__label__chat"
            num += 1
            file.write(line_cut + "\n")
    return num
def prepare_xiaohuangji(by_char=False):
    """
    Prepare the Xiaohuangji Q&A corpus as aligned input/target files.

    :param by_char: segment by character instead of by word
    """
    in_path = (config.chatbot_input_by_char_path
               if by_char else config.chatbot_input_by_word_path)
    tgt_path = (config.chatbot_target_by_char_path
                if by_char else config.chatbot_target_by_word_path)
    with open(config.xiaohuangji_path, mode="r", encoding="utf-8") as fin, \
            open(in_path, mode="w", encoding="utf-8") as f_input, \
            open(tgt_path, mode="w", encoding="utf-8") as f_target:
        num = 0
        pair = list()  # buffer holding the current question/answer pair
        for line in tqdm(fin.readlines(),
                         desc="Processing Xiaohuangji Corpus"):
            if line.startswith("E"):
                continue
            elif line.startswith("M"):
                # Drop the leading "M " and strip emoticons.
                pair.append(replace_emoji(line.strip()[2:]))
                if len(pair) == 2:
                    # Drop sentences matching the filter rules.
                    pair = [
                        " ".join(cut(s, by_character=by_char)) + "\n"
                        for s in pair if not filter_line(s)
                    ]
                    # Write only if both question and answer survived.
                    if len(pair) == 2:
                        f_input.write(pair[0])
                        f_target.write(pair[1])
                        num += 1
                    pair = list()  # reset the buffer
    print("{} QA Pairs Write".format(num))
from prepar_corpus.prepar_user_dict.test_user_dict import test_user_dict
from lib import cut
from lib import stopwords

if __name__ == '__main__':
    # Smoke-test segmentation with stopword filtering enabled.
    sample = "python难不难,不是很难,哈,啊"
    print(cut(sample, with_sg=False, use_stopwords=True))