def baidu_ym(question):
    """Answer *question* from the page text returned by ``bdzdym.baiduyemian``.

    The 'Ж' placeholder in the question is replaced by '_' before the lookup.
    The page text is stripped of BOM, ideographic-space, NBSP, newline,
    carriage-return and backslash characters, stored together with the
    question through ``gqsavetojson.gqtojson`` and then ``run_mrc.abc()`` is
    called (presumably the model reads what was just stored -- TODO confirm).

    Returns:
        A ``(answer, passage)`` tuple. ``answer`` is the cleaned ``'text'``
        field of the first value returned by ``run_mrc.abc()``, or ``''`` when
        that text is found in ``mrc_p``. ``passage`` is the first 1000
        characters of the cleaned page text.
    """
    # Characters removed from the page text / from the model answer.
    page_junk = str.maketrans('', '', '\ufeff\u3000\xa0\n\r\\')
    answer_junk = str.maketrans('', '', '\'\r\n"')

    query = question.replace('Ж', '_')
    raw_page = bdzdym.baiduyemian(query)
    page_text = raw_page.strip().translate(page_junk)
    print('页面::', page_text)

    gqsavetojson.gqtojson(page_text, query)
    best, _runner_up = run_mrc.abc()

    answer = best['text'].strip().translate(answer_junk)
    if answer in mrc_p:
        answer = ''

    # Keep at most 1000 characters of the page as the supporting passage.
    return answer, page_text[0:1000]
def test_zyb(question):
    """Answer *question* from Baidu Zhidao text (earlier variant).

    NOTE(review): a second ``test_zyb`` is defined later in this file; when
    both live in the same module the later definition replaces this one.

    Steps, as far as the code shows:
      1. Remove '+' characters and run ``clear_input`` on the question.
      2. Call ``similar`` (its answer is currently discarded, see below).
      3. Fetch text with ``bdzdym.sim_baidu``; when fewer than 100 characters
         remain after cleaning, fall back to ``bdzdym.baiduyemian``.
      4. If any pattern in ``YN_list`` matches the question, run the
         yes/no flow with the ``data/models/step_30000`` checkpoint;
         otherwise run the default ``run_mrc.abc()`` flow plus one follow-up
         query.
      5. Pass the result through ``correction``, log it with ``daily`` and
         return it, or return a fixed "no answer" message.

    Args:
        question: The raw question string.

    Returns:
        The corrected answer string, or '算了半天还是没有答案' when nothing
        was produced.
    """
    a1 = time.time()  # start of timing

    # 1. Clean the question.
    question = question.replace('+', '')
    question = clear_input(question)  # turns the interrogative part into 'Ж'
    print('清理过字符的问题==', question)

    # 2. Similarity lookup against the stored question bank.
    similar_a, similar_par = similar(question)
    # NOTE(review): the similarity answer is overwritten immediately, so the
    # branch below never runs. Kept as-is because it looks like a deliberate
    # switch-off; similar() is still called for its side effects.
    similar_a = ''
    if similar_a != '':
        aa = similar_a
        paqu = similar_par
        fromwhere = '从相似度计算得来'
        a = correction(question, aa)  # corrected answer
        daily(question, paqu, aa, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('相似度计算计算耗时==', str(a2 - a1), a)
        return a

    # 3. Baidu Zhidao lookup.
    if question.find('Ж') == -1:
        question += ",_"
    print('qqqqqqqqqqqqq', question)

    junk = str.maketrans('', '', '\ufeff\u3000\xa0\n\r\\')
    bzz = bdzdym.sim_baidu(question).strip().translate(junk)
    fromwhere = '从百度知道得来'
    if len(bzz) < 100:
        # Too little text: fall back to the page fetched by baiduyemian.
        bzz = bdzdym.baiduyemian(question.replace('Ж', '_')).translate(
            junk).strip()
        print('搜搜爬取', bzz[:1500])  # only show the first 1500 characters
        fromwhere = '从soso页面得来'

    sp_bzz = re.split(r'[。;!]', bzz)
    bdzd_a = ''
    print(bzz)
    print(len(sp_bzz), '个大句子 ', len(bzz), '字!')

    # Decide whether this is a yes/no question.
    is_yes_no = any(re.search(pattern, question) for pattern in YN_list)

    if is_yes_no:
        # Yes/no question: look for evidence with each tail, then ask for
        # the verdict itself.
        for tail in YN_tails:
            gqtojson(bzz[:800] + YN_text, question + tail)
            an, an1 = run_mrc.abc(
                init_check='data/models/step_30000')  # alternate checkpoint
            if an['text'].find('没有答案') == -1:
                bdzd_a += '小呆找依据:' + an['text'] + '...'

        # Pre-bind so the diagnostic print below cannot raise NameError when
        # the verdict prediction fails.
        yn, yns = None, None
        try:
            gqtojson(bzz[:800] + YN_text, question)
            yn, yns = run_mrc.abc(init_check='data/models/step_30000')
            if yn['text'] in ['(是的)', '(不是)', '(不一定)']:
                bdzd_a += '小萌说了算:' + yn['text'] + '##'
        except Exception as e:
            print(e)
        print(yn, yns)
        print('YN******', bdzd_a)
    else:
        gqtojson(bzz[:1500], question)
        an, ans = run_mrc.abc()
        if len(re.split(r'[。;!]', an['text'])) > 2 or len(an['text']) > 100:
            # Long answer: run the model again on the answer text itself.
            gqtojson(an['text'], question)
            an, ans = run_mrc.abc()
        print(an, '\n', ans)
        bdzd_a += '小呆会翻书:' + an['text'] + '##'
        if an['text'] in mrc_p:
            # NOTE(review): the original loop condition was
            # `an['text'] not in mrc_p or wh < len(ans)`; inside this branch
            # the first half is always false, so every entry of `ans` is
            # visited and the last one wins. Behaviour preserved -- confirm
            # whether "first entry not in mrc_p" was intended.
            for wh in range(len(ans)):
                print('whian::', ans[wh]['text'])
                bdzd_a = ans[wh]['text']
        # Follow-up query built from the answer text.
        gqtojson(bzz[:1500], an['text'] + '_')
        an1, ans1 = run_mrc.abc()
        print(an1, '\n', ans1)
        bdzd_a = bdzd_a[:-2] + '...小萌插一句:' + an1['text'] + '##'
        print("bdzd_a+++", bdzd_a)

    if bdzd_a != '':
        a = correction(question, bdzd_a)  # corrected answer
        daily(question, bzz, bdzd_a, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('baidu页面计算耗时', str(a2 - a1), a)
        return a

    # Nothing found. The original read an unassigned `baidu_par` here and
    # called daily() with four arguments; log the fetched text with the same
    # five-argument shape used by the other call sites instead.
    a = '算了半天还是没有答案'
    paqu = '瞅瞅爬到啥' + str(bzz)
    fromwhere = '哪儿都没找到'
    daily(question, paqu, a, a, fromwhere)  # write the log entry
    a2 = time.time()
    print('计算耗时', str(a2 - a1), a)
    return a
def _questions_match(asked, stored):
    """Return True when two 'Ж'-templated questions overlap enough.

    Both questions are split around their 'Ж' placeholders. The score is the
    length of the shared head plus the shared tail plus the number of
    characters of one middle part that also occur in the other middle part;
    the pair matches when the score exceeds 66% of ``len(stored)``.
    """
    p0 = asked.find('Ж')
    p1 = stored.find('Ж')
    # `first` is the question whose 'Ж' comes earlier, `second` the other.
    if p0 > p1:
        first, second = stored, asked
    else:
        first, second = asked, stored
    lo, hi = min(p0, p1), max(p0, p1)

    head = first[:lo]        # text before the earlier 'Ж'
    tail = second[hi + 1:]   # text after the later 'Ж'

    tail_pos = first.find(tail)
    if tail_pos == -1:
        return False
    middle = first[lo + 1:tail_pos]

    head_pos = second.find(head)
    if head_pos == -1:
        return False
    other_middle = second[head_pos + len(head):hi - 1]

    score = len(head) + len(tail)
    score += sum(1 for ch in middle if ch in other_middle)
    return score > len(stored) * 0.66


def similar(question):
    """Look for a stored question similar to *question* and answer from it.

    The question bank is read from ``data/newlist.txt`` (a Python list
    literal; it should start out as ``[]``). Stored questions that are equal
    to, or template-match, the asked question are collected. If *question*
    is not yet in the bank it is appended and the file is rewritten. When a
    similar stored question other than *question* itself exists, the first
    one is passed to ``gqtojson`` with the question and ``run_mrc.abc()`` is
    called.

    Args:
        question: The cleaned question, normally containing a 'Ж' placeholder.

    Returns:
        ``(answer, matched_question)``; both are ``''`` when nothing similar
        was found, the model call failed, or the answer is in ``mrc_p`` or
        still contains 'Ж'.
    """
    with open('data/newlist.txt', 'r', encoding='UTF-8') as bank_file:
        raw_bank = bank_file.read()
    # NOTE(review): eval() executes whatever the file contains. The file is
    # written below via str(list), so ast.literal_eval would be a safer
    # reader -- confirm nothing else writes this file before switching.
    file_save_list = eval(raw_bank.replace('\ufeff', ''))

    s_q = question.strip('\n')
    # Cheap pre-filter: the first three characters of the question, or the
    # last four when the placeholder sits in the first three.
    cut = s_q[0:3]
    if 'Ж' in cut:
        cut = s_q[-4:]

    same = []
    for qu in file_save_list:
        qu = qu.strip('\n')
        if cut not in qu:
            continue
        if qu == s_q or _questions_match(s_q, qu):
            same.append(qu)

    # Store the question in the bank when it is new.
    if question not in file_save_list:
        file_save_list = list(file_save_list)
        file_save_list.append(question)
        with open('data/newlist.txt', 'w', encoding='UTF-8') as bank_file:
            bank_file.write(str(file_save_list))

    three_a = ''
    three_par = ''

    # The question itself is not a useful "similar" passage.
    if question in same:
        same.remove(question)

    if same:
        papa = same[0]
        try:
            gqtojson(str(papa), question)
            # Every other call site in this file unpacks two values and
            # reads the first one's 'text'; the original kept the whole
            # return value here, which made the string checks below
            # meaningless.
            best, _runner_up = run_mrc.abc()
            three_a = best['text']
            three_par = papa
            print('相似度计算结果==', three_a)
        except Exception as err:
            # Best effort: a failed model run just means "no similar answer".
            print(err)
            three_a = ''
            three_par = ''

    if three_a in mrc_p or 'Ж' in three_a:
        three_a = ''
        three_par = ''
    return three_a, three_par
def test_zyb(question):
    """Answer *question*, trying several sources in turn.

    NOTE(review): an earlier ``test_zyb`` is defined above in this file; this
    later definition is the one that stays bound to the name.

    Order of attempts, as the code shows:
      1. ``similar``: answer from a similar stored question.
      2. Questions mentioning 喻体 / 比喻句 / 本体: run the model on the
         text before the first comma and return its answer directly (no
         ``correction`` / ``daily`` call).
      3. Text from ``bd_ym`` plus ``baike_js``: split into sentences, fed to
         the model in chunks, candidates re-scored by ``fix_prob`` and sorted
         by ``'probability'``.
      4. ``baidu_ym`` as the last resort.

    Args:
        question: The raw question string.

    Returns:
        The answer string; '算了半天还是没有答案' when every source failed.
    """
    print('question::::::::::', question)
    a1 = time.time()  # start of timing

    # 1. Clean the question.
    question = question.replace('+', '')
    question = clear_input(question)  # turns the interrogative part into 'Ж'
    print('清理过字符的问题==', question)

    # 2. Similarity lookup against the stored question bank.
    similar_a, similar_par = similar(question)
    if similar_a != '':
        aa = similar_a
        paqu = similar_par
        fromwhere = '从相似度计算得来'
        a = correction(question, aa)  # corrected answer
        daily(question, paqu, aa, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('相似度计算计算耗时==', str(a2 - a1), a)
        return a

    if re.search(r'喻体|比喻句|本体', question):
        gqsavetojson.gqtojson(re.split(r'[,,]', question)[0], question)
        an, an1 = run_mrc.abc()
        return an['text']

    # 3. Baidu Zhidao / Baike text.
    _zd_answer, bdzd_para = bd_ym(question)
    _bk_answer, bdzd_parb = baike_js(question)
    bzz = bdzd_para + bdzd_parb
    sp_bzz = re.split(r'[。;!]', bzz)
    print(len(sp_bzz), '个大句子 ', len(bzz), '字!')
    print(bzz)

    len_pa = max(len(bzz) / 3, 350)  # chunk size handed to the model
    last_index = len(sp_bzz) - 1
    sp_b = ''
    ans = []
    biyu_talk = ''  # metaphors seen so far; only used to skip repeats
    qas_texts = [
        '本文中的比喻句是:Ж', '@key,这句话中的喻体是Ж', '@key,这句话的本体是Ж',
        '@key,这个比喻中的类比属性是Ж'
    ]

    for sb, sentence in enumerate(sp_bzz):
        big_s = re.split(r'[。!\n]|\.\.\.', sentence)  # split the big sentence
        if re_list:
            for big in big_s:
                for r_b in re_list:
                    biyu = re.search(r_b, big)
                    if not biyu:
                        continue
                    b_ana = biyu_ana(biyu, big)
                    if b_ana[0] == '':
                        continue
                    if biyu_talk.find(b_ana[0]) > -1:
                        print('break重复')
                        break
                    evals = ['', '', '', '']
                    for ii in range(0, 4):
                        b_ana[ii] = b_ana[ii].strip(' ')
                        if b_ana[ii] != '':
                            qii = qas_texts[ii].replace('@key', b_ana[0])
                            gqsavetojson.gqtojson(b_ana[-1], qii)
                            an, an1 = run_mrc.abc()
                            evals[ii] = an['text']
                    if biyu_talk.find(evals[0]) > -1:
                        print('break2......')
                        break
                    biyu_talk += '本文中的比喻句有:Ж' + evals[0] + '\n'
                    if b_ana[1] != '' and b_ana[2] != '':
                        bytk = '把@k1比作@k2'.replace('@k1', evals[2]).replace(
                            '@k2', evals[1])
                        if b_ana[3] != '':
                            bytk += ',同样@k3'.replace('@k3', evals[3])
                        biyu_talk += bytk + '\n'
                    print('evals[2]::b_ana[2]', evals[2], len(b_ana[2]),
                          b_ana[2])

        try:
            sp_b += sentence + '。'
            if len(sp_b) > len_pa or sb == last_index:
                gqsavetojson.gqtojson(sp_b, question)
                an, an1 = run_mrc.abc()
                print('ananananana', an, an1)
                an['text'] = an['text'].strip().replace('\'', '').replace(
                    '\r', '').replace('\n', '').replace('\"', '')
                if an not in ans:
                    ans.append(an)
                    ans.append(an1)
                sp_b = ''
        except Exception as err:
            # Best effort per chunk: report the failure and move on. The
            # original bare `except:` hid every error, including
            # KeyboardInterrupt.
            print(err)
            continue

    # Re-score each candidate against the sentence it came from, then order
    # by probability.
    ans = fix_prob(ans, question, sp_bzz)
    ans = sorted(ans, key=lambda cand: cand["probability"], reverse=True)
    print(len(ans), 'ans::++', ans)

    bdzd_a = ans[0]['text'] if len(ans) > 0 else ''
    bdzd_par = bzz
    if bdzd_a in mrc_p:
        bdzd_a = ''

    if bdzd_a != '':
        aa = bdzd_a
        paqu = bdzd_par
        fromwhere = '从知道计算得来'
        a = correction(question, aa)  # corrected answer
        daily(question, paqu, aa, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('知道计算耗时', str(a2 - a1), a)
        return '小呆的查询:' + a + '##'

    # 4. No Zhidao result: fall back to the Baidu page.
    print('爬百度页面::', question)
    baidu_a, baidu_par = baidu_ym(question)
    if baidu_a in mrc_p:
        baidu_a = ''

    if baidu_a != '':
        aa = baidu_a
        paqu = baidu_par
        fromwhere = '从baidu页面得来'
        a = correction(question, aa)  # corrected answer
        daily(question, paqu, aa, a, fromwhere)  # write the log entry
        a2 = time.time()
        print('baidu页面计算耗时', str(a2 - a1), a)
        return a

    aa = '算了半天还是没有答案'
    a = '算了半天还是没有答案'
    paqu = '瞅瞅爬到啥' + str(baidu_par)
    fromwhere = '哪儿都没找到'
    daily(question, paqu, aa, a, fromwhere)  # write the log entry
    a2 = time.time()
    print('计算耗时', str(a2 - a1), a)
    return a
def _riji_chunks(text, limit=300):
    """Split *text* into passages of roughly *limit* characters.

    Sentences shorter than 3 characters are skipped. A passage is emitted as
    soon as it reaches *limit* characters; whatever is left at the end is
    emitted too, even when the final split piece was too short to count.
    """
    chunks = []
    buf = ''
    for sentence in re.split(r'[。;!\n]', text):
        if len(sentence) < 3:
            continue
        buf += sentence + '。'
        if len(buf) >= limit:
            chunks.append(buf)
            buf = ''
    if buf:
        chunks.append(buf)
    return chunks


def riji_test(que='你喜欢唱歌跳舞吗', f1='概念.我.xml', no='我所知'):
    """Answer *que* from passages stored in an XML file.

    NOTE: the original author marked this function as unused.

    The first ``no`` element found in ``D:/YYY/<f1>`` is scanned. Each child
    with text is scored against the jieba-segmented question: +10 for every
    question word found in the child's tag (for '日记' children the tag is
    extended with the '标题' attribute), plus the length of every distinct
    question word found in the child's text. Children scoring above 3
    contribute their text in passages of about 300 characters. The three
    best-scoring passages are run through ``run_mrc.abc()`` and the text of
    the candidate with the highest ``'probability'`` is returned.

    Args:
        que: The question.
        f1: XML file name under ``D:/YYY/``.
        no: Tag name of the element whose children are searched.

    Returns:
        The best answer text, or ``''`` when the element is missing or no
        passage produced a candidate (the original raised NameError /
        IndexError in those cases).
    """
    p_xml = 'D:/YYY/'
    root = ET.parse(p_xml + f1).getroot()

    # Only the first matching element is used.
    myexp = next(root.iter(no), None)
    if myexp is None:
        return ''

    ws = list(jieba.cut(que))
    stop_words = ('是', '有', '了', '在', '的', ',', '?')  # too common to score

    exp_my = []
    for ex in myexp:
        if not ex.text:
            continue
        topic = ex.tag
        if topic == '日记':
            topic += '+' + ex.attrib['标题']
        if topic == '象声词':  # skipped for now
            continue

        # NOTE(review): the score depends on the whole element text, not on
        # the individual passage, so every passage of one element gets the
        # same score (as in the original) -- confirm that is intended.
        sco = 0
        mw = []
        for w in ws:
            if w in stop_words:
                continue
            if w in topic:
                sco += 10
            if w in ex.text and w not in mw:
                mw.append(w)
                sco += len(w)
        if sco <= 3:
            continue

        for passage in _riji_chunks(ex.text):
            exp_my.append((topic + '\n' + passage, sco, list(mw)))

    # Highest score first.
    exp_my.sort(key=lambda item: item[1], reverse=True)
    print('exp_my::', exp_my[:3])

    ans = []
    for passage, _score, _hits in exp_my[:3]:
        gqsavetojson.gqtojson(passage, que)
        an, an1 = run_mrc.abc()
        an['text'] = an['text'].strip().replace('\'', '').replace(
            '\r', '').replace('\n', '').replace('\"', '')
        if an not in ans:
            ans.append(an)
            ans.append(an1)

    # Candidates from all passages, most probable first.
    ans.sort(key=lambda cand: cand["probability"], reverse=True)
    print('ans::', len(ans), ans)
    if not ans:
        return ''
    return ans[0]['text']