def classify_pair_corpus(bert_model):
    """Load the webank sentence-pair corpus, preprocess it and build BERT inputs for train/test."""
    # data preprocessing
    from utils.text_tools import text_preprocess, txtRead
    from conf.path_config import path_webank_sim
    import random

    webank_q_2_l = txtRead(path_webank_sim, encodeType='gbk')
    questions = []
    labels = []
    for ques_label in webank_q_2_l[1:]:
        q_2_l = ques_label.split(',')
        q_1 = q_2_l[0]
        q_2 = "".join(q_2_l[1:-1])
        label = q_2_l[-1]
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        label_int = int(label)
        labels.append([0, 1] if label_int == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)

    # shuffle and split 90% train / 10% test
    index = [i for i in range(len(labels))]
    random.shuffle(index)
    questions = questions[index]
    labels = labels[index]
    len_train = int(len(labels) * 0.9)
    train_x, train_y = questions[0:len_train], labels[0:len_train]
    test_x, test_y = questions[len_train:], labels[len_train:]

    input_ids, input_masks, input_type_ids = bert_model.process_pair(train_x)
    input_ids2, input_masks2, input_type_ids2 = bert_model.process_pair(test_x)
    return (train_x, train_y, test_x, test_y,
            input_ids, input_masks, input_type_ids,
            input_ids2, input_masks2, input_type_ids2)
def chatbot_sentence_vec_by_bert_bertasserver():
    """BERT sentence encoding served by bert-as-service"""
    from conf.path_config import chicken_and_gossip_path
    from bert_serving.client import BertClient
    from utils.text_tools import txtRead
    import numpy as np

    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [ques.split('\t')[0] for ques in questions][0:100]

    bc = BertClient(ip='localhost')
    doc_vecs = bc.encode(ques)
    np.savetxt(matrix_ques_save_path, doc_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    while True:
        query = input('你问: ')
        query_vec = np.array(bc.encode([query])[0])
        # compute normalized dot product as score
        score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1)
        topk_idx = np.argsort(score)[::-1][:topk]
        for idx in topk_idx:
            print('小姜机器人回答: %s\t%s' % (score[idx], questions[idx]))
def cut_td_idf(sources_path, target_path):
    """
        tokenize Chinese text with jieba
    :param sources_path: str, input file
    :param target_path: str, output file of space-joined tokens
    :return: None
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        governments.append(ques_q2b_syboml.strip())

    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))

    topic_ques_all = []
    for topic_ques_one in government_ques:
        # collapse repeated spaces left by tokenization
        top_ques_aqlq = topic_ques_one.replace('   ', ' ').replace('  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)

    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path)
def tok_td_idf(data_path):
    if os.path.exists(data_path + 'td_idf_cut.csv'):
        # compute TF-IDF on the pre-cut corpus and keep the fitted vectorizer for train/test use
        datas = txtRead(data_path + 'td_idf_cut.csv')
        # the default token_pattern only matches tokens of length >= 2; the pattern below also keeps
        # single-character tokens. With ngram_range=(1, 2) the vocabulary reaches 50428 terms in total.
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9,
        #                             use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3, max_df=0.9,
                                    use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)
        return vec_tdidf
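# Hypothetical usage sketch (not part of the original code): load the TfidfVectorizer
# pickled by tok_td_idf and transform one new, already-segmented sentence. The helper
# name and the example sentence are assumptions; the pickle path follows the naming above.
def example_load_and_use_tfidf(data_path, text="今天天气怎么样"):
    import pickle
    import jieba
    with open(data_path + 'td_idf_cut_model.pkl', 'rb') as f:
        vec_tdidf = pickle.load(f)
    sentence_cut = " ".join(jieba.lcut(text))            # same space-joined format as the training data
    sentence_vec = vec_tdidf.transform([sentence_cut])   # sparse 1 x max_features matrix
    return sentence_vec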
def init_tfidf_chinese_or_pinyin(sources_path):
    """
        build a gensim TF-IDF model (Chinese tokens or pinyin)
    :param sources_path: str
    :return: None
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    with open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') as file:
        pickle.dump([dictionary, tfidf_model], file)
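# Hypothetical usage sketch (not part of the original code): load the pickled
# [dictionary, tfidf_model] pair saved by init_tfidf_chinese_or_pinyin and compute the
# TF-IDF weights of a new question. The helper name and example sentence are assumptions.
def example_score_with_gensim_tfidf(sources_path, text="今天天气怎么样"):
    import pickle
    import jieba
    with open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'rb') as f:
        dictionary, tfidf_model = pickle.load(f)
    bow = dictionary.doc2bow(list(jieba.cut(text)))   # bag-of-words ids in the trained dictionary
    return tfidf_model[bow]                           # list of (token_id, tfidf_weight)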
def creat_train_data_of_cg_corpus(limit=50, x_limit=2, y_limit=2):
    """Build character-level (question, answer) training pairs from the chicken_and_gossip corpus."""
    x_datas = []
    y_datas = []
    max_len = 0
    sim_ali_web_gov_dli_datas = txtRead(chicken_and_gossip_path, encodeType="utf-8")
    for sim_ali_web_gov_dli_datas_one in sim_ali_web_gov_dli_datas[1:]:
        if sim_ali_web_gov_dli_datas_one:
            sim_ali_web_gov_dli_datas_one_split = sim_ali_web_gov_dli_datas_one.strip().split("\t")
            if len(sim_ali_web_gov_dli_datas_one_split) == 2:
                # if sim_ali_web_gov_dli_datas_one_split[2] == "1":
                len_x1 = len(sim_ali_web_gov_dli_datas_one_split[0])
                len_x2 = len(sim_ali_web_gov_dli_datas_one_split[1])
                # if max_len < len_x1 or max_len < len_x2:
                max_len = max(len_x1, len_x2, max_len)
                sentence_org = regular(sim_ali_web_gov_dli_datas_one_split[0], limit=limit)
                sentence_sim = regular(sim_ali_web_gov_dli_datas_one_split[1], limit=limit)
                x_datas.append([sen for sen in sentence_org])
                y_datas.append([sen for sen in sentence_sim])
                # x_datas.append([sen for sen in sentence_sim])
                # y_datas.append([sen for sen in sentence_org])

    # keep only pairs within the length bounds
    datas = list(zip(x_datas, y_datas))
    datas = [(x, y) for x, y in datas
             if len(x) < limit and len(y) < limit and len(y) >= y_limit and len(x) >= x_limit]
    x_datas, y_datas = zip(*datas)

    print('fit word_sequence')
    ws_input = WordSequence()
    ws_input.fit(x_datas + y_datas)

    print('dump')
    pickle.dump((x_datas, y_datas), open(chatbot_data_cg_xy_anti, 'wb'))
    pickle.dump(ws_input, open(chatbot_data_cg_ws_anti, 'wb'))

    print('done')
    print(max_len)
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
    """
        build word-level sentence vectors for the standard questions; write them in
        batches (e.g. sen_count=10000) to avoid running out of memory
    :param sen_count: int, number of sentence vectors written per batch
    :param word2vec_model: gensim word2vec model
    :param qa_path: str
    :param matrix_ques_path_word: str
    :return:
    """
    if os.path.exists(matrix_ques_path_word):
        file_matrix_ques = open(matrix_ques_path_word, 'rb')
        matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques

    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        # questions.append(ques)
        word_list, flag_list = word_segment_process(ques)
        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt", matrix_ques)
            matrix_ques = []
            # break

    # write the last (partial) batch
    count += 1
    np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt", matrix_ques)
    # matrix_ques = []
    # file_matrix_ques = open(matrix_ques_path, 'wb')
    # pickle.dump(matrix_ques, file_matrix_ques)
    print('create_matrix_org_np ok!')
    # return matrix_ques
def chatbot_sentence_vec_by_bert_own():
    """BERT sentence encoding with our own KerasBertVector"""
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    from conf.path_config import chicken_and_gossip_path
    from utils.text_tools import txtRead
    import numpy as np

    # read the corpus and some parameters; only the first 100 standard questions are used here
    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [ques.split('\t')[0] for ques in questions][0:100]

    # build the BERT sentence vectors of the standard questions
    bert_vector = KerasBertVector()
    ques_basic_vecs = bert_vector.bert_encode(ques)

    # in production you can precompute these vectors once and simply load them
    np.savetxt(matrix_ques_save_path, ques_basic_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    query_bert_vec = bert_vector.bert_encode(["小姜机器人是什么"])[0]
    query_bert_vec = np.array(query_bert_vec)
    print(query_bert_vec)
    # a plain matrix dot product is fast; tools like annoy make the search even faster
    qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1)
    topk_idx = np.argsort(qq_score)[::-1][:topk]
    for idx in topk_idx:
        print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))

    while True:
        print("你的问题:")
        query = input()
        query_bert_vec = np.array(bert_vector.bert_encode([query])[0])
        # a plain matrix dot product is fast; tools like annoy make the search even faster
        qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1)
        topk_idx = np.argsort(qq_score)[::-1][:topk]
        for idx in topk_idx:
            print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
    """
        build character-level sentence vectors for the standard questions
    :param sen_count: int, number of sentence vectors written per batch
    :param word2vec_model: gensim model
    :param qa_path: str
    :param matrix_ques_path: str
    :return: None
    """
    if os.path.exists(matrix_ques_path):
        file_matrix_ques = open(matrix_ques_path, 'rb')
        matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques

    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        char_list = [ques_char for ques_char in ques]
        sentence_vec = question_encoding(word2vec_model, char_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt", matrix_ques)
            matrix_ques = []
            break  # stop after the first full batch

    # write the last (partial) batch
    count += 1
    np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt", matrix_ques)
    print('create_matrix_org_pkl ok!')
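# Hypothetical usage sketch (not part of the original code): reload one of the sentence-vector
# batches written by create_matrix_org_np and retrieve the top-k most similar stored questions
# for a query vector, using the same normalized dot-product score as the BERT retrieval
# functions above. The helper name and its parameters are assumptions.
def example_topk_from_saved_matrix(matrix_txt_path, query_vec, topk=5):
    import numpy as np
    matrix_ques = np.atleast_2d(np.loadtxt(matrix_txt_path))   # shape (n_questions, dim)
    query_vec = np.array(query_vec)
    score = np.sum(query_vec * matrix_ques, axis=1) / np.linalg.norm(matrix_ques, axis=1)
    return np.argsort(score)[::-1][:topk]                      # indices of the best matches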
def classify_pair_corpus_webank(bert_model, path_webank):
    """Load one webank sentence-pair file, preprocess it and build BERT inputs."""
    # data preprocessing
    from utils.text_tools import text_preprocess, txtRead

    webank_q_2_l = txtRead(path_webank, encodeType='utf-8')
    questions = []
    labels = []
    for ques_label in webank_q_2_l[1:]:
        q_2_l = ques_label.split(',')
        q_1 = q_2_l[0]
        q_2 = "".join(q_2_l[1:-1])
        label = q_2_l[-1]
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        label_int = int(label)
        labels.append([0, 1] if label_int == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)
    input_ids, input_masks, input_type_ids = bert_model.process_pair(questions)
    return questions, labels, input_ids, input_masks, input_type_ids
def cut_td_idf_pinyin(sources_path, target_path):  # convert to pinyin
    """
        convert Chinese text to pinyin
    :param sources_path: str, input file
    :param target_path: str, output file
    :return: None
    """
    pin = xpinyin.Pinyin()
    corpus = txtRead(sources_path)
    topic_ques_all = []
    corpus_count = 0
    for corpus_one in corpus:
        corpus_count += 1
        # time1 = time.time()
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        ques_q2b_syboml_pinying = pin.get_pinyin(ques_q2b_syboml.replace(' ', '').strip(), ' ')
        topic_ques_all.append(ques_q2b_syboml_pinying + '\n')
        # time2 = time.time()
        # print(str(corpus_count) + 'time:' + str(time2 - time1))
    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf_pinyin ok! " + sources_path)
if __name__ == '__main__':
    # read the question-answer corpus
    syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')

    # load word vectors; w2v_model_wiki_word_path is trained by ourselves,
    # w2v_model_merge_short_path only keeps part of the data and can be downloaded separately
    if os.path.exists(w2v_model_wiki_word_path):
        word2vec_model = load_word2vec_model(w2v_model_wiki_word_path, limit=None)
        print("load w2v_model_wiki_word_path ok!")
    else:
        word2vec_model = load_word2vec_model(w2v_model_merge_short_path, limit=None)
        print("load w2v_model_merge_short_path ok!")

    # build sentence vectors for the standard questions and store them under matrix_ques_path
    if not os.path.exists(matrix_ques_part_path):
        create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model,
                             qa_path=chicken_and_gossip_path,
                             matrix_ques_path_word=matrix_ques_part_path)  # trailing arguments reconstructed from context
def statistics_keyword_by_label(path, rate=1):
    """
    count keywords per label and extract the words that are unique to each label
    Args:
        path: str, eg. "train.json"
        rate: float, eg. 0.75
    Returns:
        None
    """
    datas = txtRead(path)
    lwd = {}
    for i in tqdm(range(len(datas)), desc="jieba cut and statistics: "):
        # read the text from the standard document and tokenize it
        d = datas[i]
        d_json = json.loads(d)
        text = d_json.get("x", {}).get("text")
        label = d_json.get("y")
        word_list = list(jieba.cut(text))
        # drop stop words, pure numbers and single characters
        word_list = [wl for wl in word_list
                     if wl not in stop_words and not is_total_number(wl) and len(wl) >= 2]
        # word-frequency statistics within the label
        word_freq_dict = Counter(word_list)
        if label not in lwd:
            lwd[label] = word_freq_dict
        else:
            lwd[label].update(word_freq_dict)  # Counter.update adds the counts instead of overwriting them

    # keep the top `rate` share of words per label, sorted by frequency
    lwd_keys = list(lwd.keys())
    lwd_soft = [sorted(lwd[l].items(), key=lambda x: x[1], reverse=True) for l in lwd_keys]
    lwd_soft_rate = [s[:int(len(s) * rate)] for s in lwd_soft]
    label_word_dict = {lwd_keys[i]: OrderedDict(lwd_soft_rate[i]) for i in range(len(lwd_keys))}
    print("cut ok!")

    # collect the words that occur in only one label
    label_keys = set(list(label_word_dict.keys()))
    label_words = {}
    for key in label_keys:
        key_dict = set(list(label_word_dict[key].keys()))
        keys_other = copy.deepcopy(label_keys)
        keys_other.discard(key)
        # all words of the other labels
        kos = set()
        for ko in keys_other:
            ko_dict = set(list(label_word_dict[ko].keys()))
            kos = kos | ko_dict
        # words unique to this label
        key_public = kos & key_dict
        key_label = key_dict - key_public
        label_word_freq = {kl: label_word_dict[key][kl] for kl in key_label}
        label_words[key] = label_word_freq
    save_json(label_words, "label_keyword_unique.json")
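# Hypothetical usage sketch (not part of the original code): build a tiny JSON-lines file in the
# format statistics_keyword_by_label expects ({"x": {"text": ...}, "y": label}) and run the
# per-label keyword statistics on it. The helper name, the file name and the sample data are assumptions.
def example_statistics_keyword_by_label(tmp_path="label_keyword_demo.json"):
    import json
    samples = [{"x": {"text": "帮我查一下明天的天气"}, "y": "weather"},
               {"x": {"text": "今天气温多少度"}, "y": "weather"},
               {"x": {"text": "讲个笑话给我听"}, "y": "chat"}]
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write("\n".join(json.dumps(s, ensure_ascii=False) for s in samples))
    statistics_keyword_by_label(tmp_path, rate=1)   # writes label_keyword_unique.json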
    end_time3 = time.time()
    # print('end_time1: ' + str(end_time1 - start_time))
    # print('end_time2: ' + str(end_time2 - start_time))
    # print('end_time3: ' + str(end_time3 - start_time))
    return result
    # available fuzzywuzzy scorers:
    # [fuzz.WRatio, fuzz.QRatio,
    #  fuzz.token_set_ratio, fuzz.token_sort_ratio,
    #  fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
    #  fuzz.UWRatio, fuzz.UQRatio]


if __name__ == '__main__':
    start_time = time.time()
    qa_list = txtRead(chicken_and_gossip_path)
    questions = [qa.strip().split("\t")[0] for qa in qa_list]
    print("read questions ok!")

    sen = "你谁呀"
    # list_fuzzyfinder = fuzzyfinder(base_syn_one_split[1], qa_list)
    # list_fuzzyfinder = fuzzy_fuzzywuzzy(fuzz, base_syn_one_split[1], qa_list)
    print("你问: " + sen)
    list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, sen, qa_list, questions, topn=5)
    print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip())
    print("推荐结果: ")
    print(list_fuzzyfinder)
    gen_all_syn = []
    for generated_hot_one in generated_hot:
        generated_hot_one_1 = [generated_hot_one]
        generated_str = generate_random_select(generated_hot_one_1, model_txt, twice=1000, len_min=5)
        if generated_str:
            gen_all_syn = gen_all_syn + generated_str

    # keep only the generated sentences that do not appear in the original corpus
    gen_all_syn = list(set(gen_all_syn))
    # intersection of generated sentences and original sentences
    syn_intersection = list(set(sentence_list).intersection(set(gen_all_syn)))
    # generated sentences minus that intersection
    gen_syns = list(set(gen_all_syn).difference(set(syn_intersection)))
    return gen_syns


if __name__ == "__main__":
    # read one file and generate new sentences from it
    txt_path = chicken_and_gossip_path
    sentence_list = txtRead(txt_path)
    sentence_list = sentence_list[0:100]
    enhance_texts = generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1)
    for enhance_texts_one in enhance_texts:
        try:
            print(enhance_texts_one)
        except Exception as e:
            print(str(e))