def initQA(mybot):
    """Initialise the tokenizer and load the AIML corpus into *mybot*.

    *mybot* must provide ``learn(path)`` and ``respond(text)``. The corpus
    files are read from the ``QA/resources`` directory located next to this
    source file. Nothing is returned.
    """
    # Initialise the jieba tokenizer.
    T.jieba_initialize()

    # Switch to the working directory the corpus is expected in.
    os.chdir('./')

    here = os.path.split(os.path.realpath(__file__))[0]
    corpus_files = (
        "std-startup.xml",
        "bye.aiml",
        "tools.aiml",
        "bad.aiml",
        "funny.aiml",
        "OrdinaryQuestion.aiml",
        "Common conversation.aiml",
    )
    for file_name in corpus_files:
        mybot.learn(here + "/QA/resources/" + file_name)

    mybot.respond('Load Doc Snake')
def query(entity, attr):
    """Look up one attribute of a Baidu Baike entry.

    The entry page for *entity* is fetched, its ``basic-info cmn-clearfix``
    block is handed to ``get_info`` and the value stored under *attr* is
    returned. When *attr* is not a key of that mapping it is first mapped
    through the synonym dictionary and looked up a second time.

    Args:
        entity: entry name, appended to the Baike item URL.
        attr: attribute name as a UTF-8 encoded byte string.

    Returns:
        The value stored for the attribute on a hit; otherwise the string
        ``attr + "::找不到"`` (using the synonym-resolved name if the synonym
        lookup ran).
    """
    soup = To.get_html_baidu("http://baike.baidu.com/item/" + entity)
    info_block = soup.find(class_='basic-info cmn-clearfix')
    if info_block is None:
        return attr + "::找不到"

    info = get_info(info_block)
    key = attr.decode('utf8')
    if key in info:
        return info[key]

    # No direct hit: resolve the attribute through the synonym dictionary.
    resource_dir = os.path.dirname(
        os.path.split(os.path.realpath(__file__))[0]) + '/resources'
    attr_list = T.load_baikeattr_name(resource_dir + '/Attribute_name.txt')
    attr = T.load_synonyms_word_inattr(
        attr, resource_dir + '/SynonDic.txt', attr_list)
    key = attr.decode('utf8')
    if key in info:
        return info[key]
    return attr + "::找不到"
# run(query): answer a single question string and return the answer.
#
# Visible steps: initialise jieba through T.jieba_initialize(), chdir to './',
# create a new aiml.Kernel and learn seven corpus files from the "resources"
# directory beside this file (all of this is repeated on every call).
# A query longer than 60 characters gets the fixed reply '句子长度过长'; a blank
# query gets mybot.respond('无'). Anything else is segmented with
# T.wordSegment and sent to the kernel. A response that starts with '#' and
# contains "searchbaike" is split on ':' into entity / attribute and looked up
# with baike.query; when that result contains '找不到' the code falls back to
# search_summary.kwquery(query) and keeps the result only when exactly one
# candidate comes back (zero candidates -> mybot.respond('找不到答案'), more than
# one -> '找不到答案').
#
# NOTE(review): `words = T.postag(query)` is assigned but never read.
# NOTE(review): the indentation of this block has been lost, so which `if`
# each of the trailing `else:` branches belongs to cannot be confirmed from
# this text. Verify against a properly indented copy before changing logic.
def run(query): #if __name__ == '__main__': #初始化jb分词器 T.jieba_initialize() #切换到语料库所在工作目录 mybot_path = './' os.chdir(mybot_path) mybot = aiml.Kernel() mybot.learn(os.path.split(os.path.realpath(__file__))[0]+"/resources/std-startup.xml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml") if len(query) > 60: answer = '句子长度过长' elif query.strip() == '': answer = mybot.respond('无') else: message = T.wordSegment(query) words = T.postag(query) response = mybot.respond(message) if response == '': answer = mybot.respond('找不到答案') elif response[0] == '#': if response.__contains__("searchbaike"): res = response.split(':') entity = str(res[1]).replace(" ","") attr = str(res[2]).replace(" ","") ans = baike.query(entity, attr) if '找不到' not in ans: answer = ans elif ans.__contains__('找不到'): answer = search_summary.kwquery(query) if len(answer) == 0: answer = mybot.respond('找不到答案') elif len(answer) == 1: print(answer) answer = answer[0].strip().replace(' ','').replace("\n","") else: answer = '找不到答案' else: answer = '找不到答案' else: answer = search_summary.kwquery(query) if len(answer) == 0: answer = mybot.respond('找不到答案') elif len(answer) == 1: answer = answer[0].strip().replace(' ','').replace("\n","") else: answer = '找不到答案' return answer
def query(entity,attr):
    """Return the value of *attr* from the Baidu Baike entry of *entity*.

    The entry's ``basic-info cmn-clearfix`` block is passed to ``get_info``.
    If the resulting mapping has no truthy value for *attr*, the attribute
    name is resolved through the synonym dictionary and tried once more.
    Returns the stripped value on a hit and ``'找不到'`` otherwise.
    """
    page = To.get_html_baike("http://baike.baidu.com/item/"+entity)
    info_block = page.find(class_= 'basic-info cmn-clearfix')
    if not info_block:
        return '找不到'

    info = get_info(info_block)
    if not info.get(attr):
        # Direct lookup failed: retry under the synonym-resolved name.
        parent_dir = os.path.dirname(os.path.split(os.path.realpath(__file__))[0])
        known_attrs = T.load_baikeattr_name(parent_dir+'/resources/Attribute_name.txt')
        attr = T.load_synonyms_word_inattr(attr,parent_dir+'/resources/SynonDic.txt',known_attrs)
        if not info.get(attr):
            return '找不到'
    return info[attr].strip()
#coding:utf8 import aiml import os, sys from QA.QACrawler import baike from QA.Tools import Html_Tools as QAT from QA.Tools import TextProcess as T from QACrawler import search_summary if __name__ == '__main__': #初始化jb分词器 T.jieba_initialize() #切换到语料库所在工作目录 mybot_path = './' os.chdir(mybot_path) mybot = aiml.Kernel() mybot.learn( os.path.split(os.path.realpath(__file__))[0] + "./resources/std-startup.xml") mybot.learn( os.path.split(os.path.realpath(__file__))[0] + "./resources/Common conversation.aiml") mybot.learn( os.path.split(os.path.realpath(__file__))[0] + "./resources/bye.aiml") mybot.learn( os.path.split(os.path.realpath(__file__))[0] + "./resources/tools.aiml") mybot.learn(
# ws(): WebSocket chat loop (Python 2 syntax).
#
# Takes the socket from request.environ.get('wsgi.websocket') and loops
# forever: each received message is parsed as JSON and the question text is
# read from ['data']['mine']['content']. The answer is produced from
# mybot.respond, baike.query, search_summary.kwquery or geta(msg), and is sent
# back as a JSON object whose "content" field holds the answer.
#
# NOTE(review): `response[0]` is evaluated before the `response == ""` test,
# so an empty kernel response would raise IndexError before that test runs.
# NOTE(review): the `continue` statements after the length / blank checks
# appear to be commented out, so processing seems to go on after `answer` is
# set there — confirm against a properly indented copy.
# NOTE(review): `exit()` is called when the segmented message equals 'q'.
# NOTE(review): "timestamp" is the constant 1467475443306, not the current time.
# NOTE(review): the indentation of this block has been lost; the nesting of
# the `print type(ans)` / `if len(ans) == 0` chain cannot be confirmed here.
def ws(): user_socket = request.environ.get('wsgi.websocket') # type:WebSocket while 1: msg =user_socket.receive() question = json.loads(msg) q = question['data']['mine']['content'] msg =q input_message = str(msg).encode('utf-8') if len(input_message) > 60: answer = mybot.respond("句子长度过长") # continue elif input_message.strip() == '': answer = mybot.respond("无话可说") # continue # print input_message message = T.wordSegment(input_message) # 去标点 # print 'word Seg:'+ message # print '词性:' words = T.postag(input_message) if message == 'q': exit() else: response = mybot.respond(message) # 在AIML数据集里寻找答案 print "=======" if response[0] == '#': print response + 'mark' else: answer = response print "=======" if response == "": ans = mybot.respond('找不到答案') answer = ans # 百科搜索 elif response[0] == '#' or len(response) < 1: # 匹配百科 if response.__contains__("searchbaike"): print "searchbaike" print response res = response.split(':') # 实体 entity = str(res[1]).replace(" ", "") # 属性 attr = str(res[2]).replace(" ", "") print entity + '<---->' + attr ans = baike.query(entity, attr) # 如果命中答案 print type(ans) if type(ans) == list: answer = '回答:' + QAT.ptranswer(ans, False) # continue elif ans.decode('utf-8').__contains__(u'::找不到'): # 百度摘要+Bing摘要 print "通用搜索" answer = search_summary.kwquery(input_message) # 匹配不到模版,通用查询 elif response.__contains__("NoMatchingTemplate"): print "NoMatchingTemplate" ans = search_summary.kwquery(input_message) print type(ans) if len(ans) == 0: ans = mybot.respond('找不到答案') answer = '回答:' + ans elif len(ans) > 1: print "不确定候选答案" answer = ans[0] print 'Eric: ' for a in ans: print a.encode("utf-8") else: answer = '回答:' + ans[0].encode("utf-8") # 匹配模版 else: answer = '回答:' + response s = '展开全部' print type(answer).__name__ if (type(answer).__name__ == 'list' and len(answer)>0) or '唔... 
怎么回答...'in answer or '天气' in msg: answer = geta(msg) answer = answer print 'wocao' else: #print answer if s in str(answer): print answer answer = str(answer).replace('\n', '').replace('展开全部', "").split('已赞过')[0] print 'OS' +answer print user_socket,msg res =answer a = { "username": "******", "avatar": "https://robot.rszhang.top/images/icon/nv/0.jpg", "id": "-2", # //消息的来源ID(如果是私聊,则是用户id,如果是群聊,则是群组id) "type": "friend", # //聊天窗口来源类型,从发送消息传递的to里面获取 "content": res, # //消息内容 "cid": 0, # //消息id,可不传。除非你要对消息进行一些操作(如撤回) "mine": True, # //是否我发送的消息,如果为true,则会显示在右方 "fromid": "100000", # /消息的发送者id(比如群组中的某个消息发送者),可用于自动解决浏览器多窗口时的一些问题 "timestamp": 1467475443306, # //服务端时间戳毫秒数。注意:如果你返回的是标准的 unix 时间戳,记得要 *1000 } user_socket.send(json.dumps(a))
import sys reload(sys) import json from QA.Tools.tuling import geta from flask import Flask, render_template, request from geventwebsocket.handler import WebSocketHandler from geventwebsocket.websocket import WebSocket from gevent.pywsgi import WSGIServer from QA.QACrawler import baike from QA.Tools import Html_Tools as QAT from QA.Tools import TextProcess as T#文字处理 from QA.QACrawler import search_summary # 初始化jb分词器 T.jieba_initialize() # 切换到语料库所在工作目录 mybot = aiml.Kernel() # if os.path.isfile("bot_brain.brn"): # mybot.bootstrap(brainFile="bot_brain.brn") # else: # # mybot.saveBrain("bot_brain.brn") # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/std-startup.xml") # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tuling.xml") # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/test.xml") # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "QA/resources/bye.aiml") # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "QA/resources/tools.aiml")
def _heading_link(node, heading_tag):
    """Return the first ``<a>`` inside the first *heading_tag* of *node*, or None."""
    heading = node.find(heading_tag)
    if heading is None:
        return None
    return heading.find("a")


def _zhidao_answer_text(zhidao_soup):
    """Return the answer text of a parsed Baidu Zhidao page, or None if absent."""
    if zhidao_soup is None:
        return None
    container = zhidao_soup.find(class_='bd answer')
    if container is None:
        return None
    node = container.find('pre')
    if node is None:
        node = container.find(class_='line content')
    if node is None:
        return None
    return node.get_text()


def _rank_person_names(text, keywords, cut_marks, limit=3):
    """Return up to *limit* person names that occur most often in *text*.

    *text* is split into sentences at every character in *cut_marks*; only
    sentences containing one of *keywords* are tagged with ``T.postag``, and
    words flagged ``"nr"`` are counted. Names that are themselves keywords
    are dropped from the result.
    """
    sentences = []
    current = ''
    for ch in text:
        if ch in cut_marks:
            if current != '':
                sentences.append(current)
                current = ''
        else:
            current += ch

    # Keep only the sentences that mention at least one keyword.
    key_sentences = {}
    for sentence in sentences:
        if any(k in sentence for k in keywords):
            key_sentences[sentence] = 1

    name_counts = {}
    for sentence in key_sentences:
        for w in T.postag(sentence):
            if w.flag == "nr":
                name_counts[w.word] = name_counts.get(w.word, 0) + 1

    # Most frequent first; a key function works on Python 2 and 3 alike.
    ranked = sorted(name_counts.items(), key=lambda item: item[1], reverse=True)
    candidates = [name for name, _ in ranked if name not in keywords]
    print("返回前n个词频")
    return candidates[:limit]


def kwquery(query):
    """Search Baidu and Bing for an answer to *query*.

    Strategy, in order:
      1. The first Baidu result is checked for a direct-answer card
         (knowledge graph, poetry, calculator, Zhidao best answer) and for a
         Baidu Zhidao / Baidu Baike hit.
      2. The Bing knowledge box is used if it has a second ``b_vList`` with
         an ``<li>``; otherwise the Bing result list is scanned for a
         "必应网典" entry.
      3. If nothing matched, the collected result text is mined for the most
         frequent person names.

    Missing page elements are treated as "no answer from this source"
    instead of raising.

    Returns:
        A list of answer strings, possibly empty.
    """
    # Keywords are the words of the question whose POS flag contains "n".
    keywords = [w.word for w in T.postag(query) if "n" in w.flag]
    answer = []
    text = ''

    # Baidu results with ids 1..9.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' + quote(query))
    for i in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=i)
        if results is None:
            print("百度摘要找不到答案")
            break

        # Direct-answer cards are only looked for on the first result.
        has_mu = i == 1 and 'mu' in results.attrs

        # Knowledge-graph card.
        if has_mu:
            r = results.find(class_='op_exactqa_s_answer')
            if r is None:
                print("百度知识图谱找不到答案")
            else:
                print("百度知识图谱找到答案")
                answer.append(r.get_text().strip())
                return answer

        # Classical-poetry card.
        if has_mu:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r is None:
                print("百度诗词找不到答案")
            else:
                print("百度诗词找到答案")
                answer.append(r.get_text().strip())
                return answer

        # Calculator card.
        if has_mu and ('http://open.baidu.com/static/calculator/calculator.html'
                       in results.attrs['mu']):
            try:
                r = results.find('div').find_all('td')[1].find_all('div')[1]
            except (AttributeError, IndexError):
                r = None
            if r is None:
                print("计算器找不到答案")
            else:
                print("计算器找到答案")
                answer.append(r.get_text().strip())
                return answer

        # Baidu Zhidao best-answer card.
        if has_mu:
            r = results.find(class_='op_best_answer_question_link')
            if r is None:
                print("百度知道图谱找不到答案")
            else:
                print("百度知道图谱找到答案")
                found = _zhidao_answer_text(To.get_html_zhidao(r['href']))
                if found is not None:
                    answer.append(found)
                    return answer

        link = _heading_link(results, "h3")
        if link is not None and i == 1:
            title = link.get_text()
            url = link.get('href')

            # Baidu Zhidao result.
            if u"百度知道" in title:
                if url is None:
                    print("百度知道图谱找不到答案")
                    continue
                print("百度知道图谱找到答案")
                found = _zhidao_answer_text(To.get_html_zhidao(url))
                if found is None:
                    continue
                answer.append(found.strip())
                return answer

            # Baidu Baike result.
            if u"百度百科" in title:
                if url is None:
                    print("百度百科找不到答案")
                    continue
                print("百度百科找到答案")
                baike_soup = To.get_html_baike(url)
                summary = None
                if baike_soup is not None:
                    summary = baike_soup.find(class_='lemma-summary')
                if summary is None:
                    continue
                answer.append(summary.get_text().replace("\n", "").strip())
                return answer

        text += results.get_text()

    # Bing knowledge box.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' + quote(query))
    bingbaike = None
    if soup_bing is not None:
        bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike is not None:
        v_lists = bingbaike.find_all(class_="b_vList")
        if len(v_lists) > 1 and v_lists[1].find("li") is not None:
            print("Bing知识图谱找到答案")
            answer.append(bingbaike.get_text())
            return answer

    # No usable knowledge box: scan the Bing result list.
    print("Bing知识图谱找不到答案")
    results = None
    if soup_bing is not None:
        results = soup_bing.find(id="b_results")
    if results is not None:
        for bl in results.find_all('li'):
            if u" - 必应网典" not in bl.get_text():
                continue
            print("查找Bing网典")
            link = _heading_link(bl, "h2")
            url = None if link is None else link.get('href')
            if url is None:
                print("Bing网典找不到答案")
                continue
            print("Bing网典找到答案")
            bingwd_soup = To.get_html_bingwd(url)
            card = None
            if bingwd_soup is not None:
                card = bingwd_soup.find(class_='bk_card_desc')
            r = None if card is None else card.find("p")
            if r is None:
                continue
            answer.append(r.get_text().replace("\n", "").strip())
            return answer
        text += results.get_text()

    # Neither engine gave a direct answer: analyse the collected snippets.
    cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
    return _rank_person_names(text, keywords, cutlist)
def _heading_link(node, heading_tag):
    """Return the first ``<a>`` inside the first *heading_tag* of *node*, or None."""
    heading = node.find(heading_tag)
    if heading is None:
        return None
    return heading.find("a")


def _zhidao_answer_text(zhidao_soup):
    """Return the answer text of a parsed Baidu Zhidao page, or None if absent."""
    if zhidao_soup is None:
        return None
    container = zhidao_soup.find(class_='bd answer')
    if container is None:
        return None
    node = container.find('pre')
    if node is None:
        node = container.find(class_='line content')
    if node is None:
        return None
    return node.get_text()


def _rank_person_names(text, keywords, cut_marks, limit=3):
    """Return up to *limit* person names that occur most often in *text*.

    *text* is split into sentences at every character in *cut_marks*; only
    sentences containing one of *keywords* are tagged with ``T.postag``, and
    words flagged ``"nr"`` are counted. Names that are themselves keywords
    are dropped from the result.
    """
    sentences = []
    current = ''
    for ch in text:
        if ch in cut_marks:
            if current != '':
                sentences.append(current)
                current = ''
        else:
            current += ch

    # Keep only the sentences that mention at least one keyword.
    key_sentences = {}
    for sentence in sentences:
        if any(k in sentence for k in keywords):
            key_sentences[sentence] = 1

    name_counts = {}
    for sentence in key_sentences:
        for w in T.postag(sentence):
            if w.flag == "nr":
                name_counts[w.word] = name_counts.get(w.word, 0) + 1

    # Most frequent first; a key function works on Python 2 and 3 alike.
    ranked = sorted(name_counts.items(), key=lambda item: item[1], reverse=True)
    candidates = [name for name, _ in ranked if name not in keywords]
    print("返回前n个词频")
    return candidates[:limit]


def kwquery(query):
    """Search Baidu and Bing for an answer to *query*.

    Strategy, in order:
      1. The first Baidu result is checked for a direct-answer card
         (knowledge graph, poetry, calendar, calculator, Zhidao best answer);
         the first two results are checked for a Baidu Zhidao / Baidu Baike
         hit.
      2. The Bing knowledge box is used if it has a second ``b_vList`` with
         an ``<li>``; otherwise the Bing result list is scanned for a
         "必应网典" entry.
      3. If nothing matched, the collected result text is mined for the most
         frequent person names.

    Missing page elements are treated as "no answer from this source"
    instead of raising.

    Returns:
        A list of answer strings, possibly empty.
    """
    # Keywords are the words of the question whose POS flag contains "n".
    keywords = [w.word for w in T.postag(query) if "n" in w.flag]
    answer = []
    text = ''

    # Baidu results with ids 1..9.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd='+quote(query))
    for i in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=i)
        if results is None:
            print("百度摘要找不到答案")
            break

        # Direct-answer cards are only looked for on the first result.
        has_mu = i == 1 and 'mu' in results.attrs

        # Knowledge-graph card.
        if has_mu:
            r = results.find(class_='op_exactqa_s_answer')
            if r is None:
                print("百度知识图谱找不到答案")
            else:
                print("百度知识图谱找到答案")
                answer.append(r.get_text().strip())
                return answer

        # Classical-poetry card.
        if has_mu:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r is None:
                print("百度诗词找不到答案")
            else:
                print("百度诗词找到答案")
                answer.append(r.get_text().strip())
                return answer

        # Perpetual calendar / date card.
        if has_mu and 'http://open.baidu.com/calendar' in results.attrs['mu']:
            r = results.find(class_="op-calendar-content")
            if r is None:
                print("百度万年历找不到答案")
            else:
                print("百度万年历找到答案")
                answer.append(r.get_text().strip().replace("\n","").replace(" ",""))
                return answer

        # New-style calendar card: the answer is carried in the "fk" attribute.
        if (i == 1 and 'tpl' in results.attrs
                and 'calendar_new' in results.attrs['tpl']):
            r = results.attrs.get('fk')
            if r is not None:
                r = r.replace("6018_","")
            print(r)
            if r is None:
                print("百度万年历新版找不到答案")
            else:
                print("百度万年历新版找到答案")
                answer.append(r)
                return answer

        # Calculator card.
        if has_mu and ('http://open.baidu.com/static/calculator/calculator.html'
                       in results.attrs['mu']):
            try:
                r = results.find('div').find_all('td')[1].find_all('div')[1]
            except (AttributeError, IndexError):
                r = None
            if r is None:
                print("计算器找不到答案")
            else:
                print("计算器找到答案")
                answer.append(r.get_text().strip())
                return answer

        # Baidu Zhidao best-answer card.
        if has_mu:
            r = results.find(class_='op_best_answer_question_link')
            if r is None:
                print("百度知道图谱找不到答案")
            else:
                print("百度知道图谱找到答案")
                found = _zhidao_answer_text(To.get_html_zhidao(r['href']))
                if found is not None:
                    answer.append(found)
                    return answer

        link = _heading_link(results, "h3")
        if link is not None and (i == 1 or i == 2):
            title = link.get_text()
            url = link.get('href')

            # Baidu Zhidao result.
            if u"百度知道" in title:
                if url is None:
                    print("百度知道图谱找不到答案")
                    continue
                print("百度知道图谱找到答案")
                found = _zhidao_answer_text(To.get_html_zhidao(url))
                if found is None:
                    continue
                answer.append(found.strip())
                return answer

            # Baidu Baike result.
            if u"百度百科" in title:
                if url is None:
                    print("百度百科找不到答案")
                    continue
                print("百度百科找到答案")
                baike_soup = To.get_html_baike(url)
                summary = None
                if baike_soup is not None:
                    summary = baike_soup.find(class_='lemma-summary')
                if summary is None:
                    continue
                answer.append(summary.get_text().replace("\n","").strip())
                return answer

        text += results.get_text()

    # Bing knowledge box.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q='+quote(query))
    bingbaike = None
    if soup_bing is not None:
        bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike is not None:
        v_lists = bingbaike.find_all(class_="b_vList")
        if len(v_lists) > 1 and v_lists[1].find("li") is not None:
            print("Bing知识图谱找到答案")
            answer.append(bingbaike.get_text())
            return answer

    # No usable knowledge box: scan the Bing result list.
    print("Bing知识图谱找不到答案")
    results = None
    if soup_bing is not None:
        results = soup_bing.find(id="b_results")
    if results is not None:
        for bl in results.find_all('li'):
            if u" - 必应网典" not in bl.get_text():
                continue
            print("查找Bing网典")
            link = _heading_link(bl, "h2")
            url = None if link is None else link.get('href')
            if url is None:
                print("Bing网典找不到答案")
                continue
            print("Bing网典找到答案")
            bingwd_soup = To.get_html_bingwd(url)
            card = None
            if bingwd_soup is not None:
                card = bingwd_soup.find(class_='bk_card_desc')
            r = None if card is None else card.find("p")
            if r is None:
                continue
            answer.append(r.get_text().replace("\n","").strip())
            return answer
        text += results.get_text()

    # Neither engine gave a direct answer: analyse the collected snippets.
    cutlist = [u"。",u"?",u".", u"_", u"-",u":",u"!",u"?"]
    return _rank_person_names(text, keywords, cutlist)
def answer(question):
    """Answer *question* and return the candidate answers as a list.

    The question is segmented and sent to the module-level ``mybot`` kernel.
    A response starting with ``'#'`` is a routing marker: ``searchbaike``
    triggers a Baidu Baike attribute lookup (falling back to the web search
    when the lookup reports ``::找不到``) and ``NoMatchingTemplate`` goes
    straight to the web search. Any other response is returned as is.

    Returns:
        A list of answer strings.

    Raises:
        Exception("Too Long"): the question has more than 600 characters.
        Exception("No Input"): the question is blank.
        Exception("No Answer"): the kernel or the search produced nothing,
            including a ``'#'`` response that carries no known marker.
    """
    if len(question) > 600:
        print(mybot.respond("句子长度过长"))
        raise Exception("Too Long")
    if question.strip() == '':
        print(mybot.respond("无"))
        raise Exception("No Input")

    print(question)
    # Segment the question and strip punctuation.
    message = T.wordSegment(question)
    print('word Seg:' + message)
    print('词性:')

    if message == 'q':
        exit()

    response = mybot.respond(message)
    print(response)
    if response == "":
        raise Exception("No Answer")

    # Plain template match: the kernel's response is the answer.
    if response[0] != '#':
        print('Eric:' + response)
        return [response]

    # Start from "no candidates" so an unknown marker ends in "No Answer"
    # instead of an unbound local.
    ans = []
    if "searchbaike" in response:
        print("searchbaike")
        print(response)
        res = response.split(':')
        entity = str(res[1]).replace(" ", "")
        attr = str(res[2]).replace(" ", "")
        print(entity + '<---->' + attr)

        ans = baike.query(entity, attr)
        if isinstance(ans, list):
            # Baike hit.
            rendered = QAT.ptranswer(ans, False)
            print('Eric:' + rendered)
            return [rendered]
        if u'::找不到' in ans.decode('utf-8'):
            # Baike miss: fall back to the Baidu + Bing summaries.
            print("通用搜索")
            ans = search_summary.kwquery(question)
    elif "NoMatchingTemplate" in response:
        print("NoMatchingTemplate")
        ans = search_summary.kwquery(question)

    if len(ans) == 0:
        raise Exception("No Answer")
    if len(ans) > 1:
        print("不确定候选答案")
        print('Eric: ')
        encoded = [a.encode("utf8") for a in ans]
        for candidate in encoded:
            print(candidate)
        return encoded
    print('Eric:' + ans[0].encode("utf8"))
    return [ans[0].encode("utf8")]
#coding:utf8 import aiml import os, sys from QA.QACrawler import baike from QA.Tools import Html_Tools as QAT from QA.Tools import TextProcess as T from QACrawler import search_summary if __name__ == '__main__': #初始化jb分词器 T.jieba_initialize() #切换到语料库所在工作目录 mybot_path = './' os.chdir(mybot_path) mybot = aiml.Kernel() mybot.learn(os.path.split(os.path.realpath(__file__))[0]+"/resources/std-startup.xml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml") # mybot.respond('Load Doc Snake') #载入百科属性列表 print '''
def run(question):
    """Answer one question with the AIML bot, Baidu Baike and web search.

    A fresh ``aiml.Kernel`` is prepared on every call: it is restored from
    ``bot_brain.brn`` in the working directory when that file exists,
    otherwise the corpus in the ``resources`` directory next to this file is
    learned and the brain file is written.

    Returns:
        * the kernel's fixed reply for an over-long (> 60 characters) or
          blank question;
        * the kernel's "找不到答案" reply when the kernel response is empty;
        * the kernel's response unchanged when it is a plain template match;
        * ``'回答:' + text`` for a Baike hit, a single search candidate, or
          when the search finds nothing;
        * the list of candidates when the search returns more than one.
    """
    # Initialise the jieba tokenizer.
    T.jieba_initialize()
    # Switch to the working directory the corpus is expected in.
    os.chdir('./')

    mybot = aiml.Kernel()
    if os.path.isfile("bot_brain.brn"):
        mybot.bootstrap(brainFile="bot_brain.brn")
    else:
        resource_dir = os.path.split(os.path.realpath(__file__))[0] + "/resources/"
        for file_name in (
                "std-startup.xml",
                "tuling.xml",
                "bye.aiml",
                "tools.aiml",
                "bad.aiml",
                "funny.aiml",
                "OrdinaryQuestion.aiml",
                "Common conversation.aiml",
        ):
            mybot.learn(resource_dir + file_name)
        mybot.saveBrain("bot_brain.brn")

    print(''' Eric:你好,我是问答机器人。╭(╯^╰)╮ ''')

    input_message = question
    if len(input_message) > 60:
        return mybot.respond("句子长度过长")
    if input_message.strip() == '':
        return mybot.respond("无话可说")

    # Segment the question and strip punctuation.
    message = T.wordSegment(input_message)
    if message == 'q':
        exit()

    # Look for an answer in the AIML data set first.
    response = mybot.respond(message)
    print("=======")
    # Test for an empty response before indexing it.
    if response == "":
        ans = mybot.respond('找不到答案')
        print('Eric:' + ans)
        return ans
    if response[0] != '#':
        return response
    print(response + 'mark')
    print("=======")

    # '#' responses are routing markers. Start from "no candidates" so an
    # unknown marker ends in the not-found reply.
    ans = []
    if "searchbaike" in response:
        print("searchbaike")
        print(response)
        res = response.split(':')
        entity = str(res[1]).replace(" ", "")
        attr = str(res[2]).replace(" ", "")
        print(entity + '<---->' + attr)

        ans = baike.query(entity, attr)
        if isinstance(ans, list):
            # Baike hit.
            return '回答:' + QAT.ptranswer(ans, False)
        if u'::找不到' in ans.decode('utf-8'):
            # Baike miss: fall back to the Baidu + Bing summaries.
            print("通用搜索")
            ans = search_summary.kwquery(input_message)
    elif "NoMatchingTemplate" in response:
        print("NoMatchingTemplate")
        ans = search_summary.kwquery(input_message)

    if len(ans) == 0:
        return '回答:' + mybot.respond('找不到答案')
    if len(ans) > 1:
        print("不确定候选答案")
        return ans
    return '回答:' + ans[0].encode("utf-8")
# handle(self): request handler for one socket connection (Python 2 syntax).
#
# Uses self.request as the connection: sends a greeting, reads up to 4096
# bytes once (before the loop starts) and treats them as the question. The
# reply comes from the module-level `mybot`, from baike.query, or from
# search_summary.kwquery, and is written back with conn.sendall. Each reply
# path shown here sets Flag to False, which ends the `while Flag` loop.
#
# NOTE(review): the second guard compares input_message.strip() with '无',
# not with the empty string — confirm that this is intended.
# NOTE(review): `exit()` is called when the segmented message equals 'q'.
# NOTE(review): `words = T.postag(input_message)` is assigned but never read.
# NOTE(review): the indentation of this block has been lost; whether
# `conn.sendall(reply)` after `reply += ...` runs inside or after the
# `for a in ans` loop cannot be confirmed from this text.
def handle(self): conn = self.request conn.sendall('欢迎访问智能百科问答系统') Flag =True data = conn.recv(4096) while Flag: input_message = data print "input_message=====" print input_message print "==========" reply = '' if len(input_message) > 60: print mybot.respond("句子长度过长") reply = mybot.respond("句子长度过长") conn.sendall(reply) Flag =False continue elif input_message.strip() == '无': print mybot.respond("无") reply = mybot.respond("无") conn.sendall(reply) Flag = False continue print input_message message = T.wordSegment(input_message) # 去标点 print 'word Seg:' + message print '词性:' words = T.postag(input_message) if message == 'q': exit() else: response = mybot.respond(message) print "=======" print response print "=======" if response == "": ans = mybot.respond('找不到答案') print 'Eric:' + ans reply = mybot.respond('找不到答案') conn.sendall(reply) Flag = False # 百科搜索 elif response[0] == '#': # 匹配百科 if response.__contains__("searchbaike"): print "searchbaike" print response res = response.split(':') # 实体 entity = str(res[1]).replace(" ", "") # 属性 attr = str(res[2]).replace(" ", "") print entity + '<---->' + attr ans = baike.query(entity, attr) # 如果命中答案 if type(ans) == list: print 'Eric:' + QAT.ptranswer(ans, False) reply = QAT.ptranswer(ans, False) conn.sendall(reply) Flag = False continue elif ans.decode('utf-8').__contains__(u'::找不到'): # 百度摘要+Bing摘要 print "通用搜索" ans = search_summary.kwquery(input_message) # 匹配不到模版,通用查询 elif response.__contains__("NoMatchingTemplate"): print "NoMatchingTemplate" ans = search_summary.kwquery(input_message) if len(ans) == 0: ans = mybot.respond('找不到答案') print 'Eric:' + ans reply = ans conn.sendall(reply) Flag = False elif len(ans) > 1: print "不确定候选答案" print 'Eric: ' for a in ans: print a.encode("utf8") reply += a.encode("utf8") + '\n' conn.sendall(reply) Flag = False else: print 'Eric:' + ans[0].encode("utf8") reply = ans[0].encode("utf8") conn.sendall(reply) Flag = False # 匹配模版 else: print 'Eric:' + response reply = response conn.sendall(reply) Flag = 
False
# qa(question): answer one question and print the result (Python 2 syntax).
#
# Visible steps: initialise jieba through T.jieba_initialize(), create a new
# aiml.Kernel, learn seven corpus files from the "resources" directory beside
# this file, print an ASCII-art banner, then segment the question with
# T.wordSegment and send it to the kernel. A response starting with '#' is
# routed to baike.query ("searchbaike") or search_summary.kwquery
# ("NoMatchingTemplate"); every result is printed with an 'Eric:' prefix.
# No `return` statement appears in this block.
#
# NOTE(review): the length / blank checks only print a reply; no early exit
# is visible after them, so processing appears to continue — confirm.
# NOTE(review): `exit()` is called when the segmented message equals 'q'.
# NOTE(review): `words = T.postag(input_message)` is assigned but never read.
# NOTE(review): the line breaks inside the banner literal and the indentation
# of this block have been lost; restore both from a properly formatted copy
# before editing.
def qa(question): #初始化jb分词器 T.jieba_initialize() #切换到语料库所在工作目录 mybot_path = './' # os.chdir(mybot_path) mybot = aiml.Kernel() mybot.learn(os.path.split(os.path.realpath(__file__))[0]+"/resources/std-startup.xml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml") mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml") # mybot.respond('Load Doc Snake') #载入百科属性列表 print ''' .----------------. .-----------------. .----------------. .----------------. .----------------. | .--------------. || .--------------. || .--------------. || .--------------. || .--------------. | | | _______ | || | ____ _____ | || | __ | || | ___ ____ | || | _________ | | | | / ___ | | || ||_ \|_ _| | || | / \ | || | |_ ||_ _| | || | |_ ___ | | | | | | (__ \_| | || | | \ | | | || | / /\ \ | || | | |_/ / | || | | |_ \_| | | | | '.___`-. | || | | |\ \| | | || | / /__\ \ | || | | __'. | || | | _| _ | | | | |`\____) | | || | _| |_\ |_ | || | _/ / \ \_ | || | _| | \ \_ | || | _| |___/ | | | | | |_______.' 
| || ||_____|\____| | || ||____| |____|| || | |____||____| | || | |_________| | | | | | || | | || | | || | | || | | | | '--------------' || '--------------' || '--------------' || '--------------' || '--------------' | '----------------' '----------------' '----------------' '----------------' '----------------' Eric:你好,我是Eric。╭(╯^╰)╮ ''' input_message = question if len(input_message) > 60: print mybot.respond("句子长度过长") elif input_message.strip() == '': print mybot.respond("无") print input_message message = T.wordSegment(input_message) # 去标点 print 'word Seg:'+ message print '词性:' words = T.postag(input_message) if message == 'q': exit() else: response = mybot.respond(message) print "=======" print response print "=======" if response == "": ans = mybot.respond('找不到答案') print 'Eric:' + ans # 百科搜索 elif response[0] == '#': # 匹配百科 if response.__contains__("searchbaike"): print "searchbaike" print response res = response.split(':') #实体 entity = str(res[1]).replace(" ","") #属性 attr = str(res[2]).replace(" ","") print entity+'<---->'+attr ans = baike.query(entity, attr) # 如果命中答案 if type(ans) == list: print 'Eric:' + QAT.ptranswer(ans,False) elif ans.decode('utf-8').__contains__(u'::找不到'): #百度摘要+Bing摘要 print "通用搜索" ans = search_summary.kwquery(input_message) # 匹配不到模版,通用查询 elif response.__contains__("NoMatchingTemplate"): print "NoMatchingTemplate" ans = search_summary.kwquery(input_message) if len(ans) == 0: ans = mybot.respond('找不到答案') print 'Eric:' + ans elif len(ans) >1: print "不确定候选答案" print 'Eric: ' for a in ans: print a.encode("utf8") else: print 'Eric:' + ans[0].encode("utf8") # 匹配模版 else: print 'Eric:' + response
# QA(input_message, mybot): answer a school-related question and return a JSON
# string (Python 2 syntax).
#
# Parameters: `input_message` is the question text; `mybot` must provide
# respond(text). Returns json.dumps(ansdict), where ansdict always receives
# the key 'baidu' (the reply text) and may also hold the attribute/content
# rows read from the `school` table plus 'schoolname', 'intention' and
# 'index' copied from the dict returned by
# search_summary.kwquery(input_message, intention, schoolname).
#
# Visible steps: words tagged 'school' are looked up in the `学校简称` table
# and replaced by the full school name; words tagged 'x' or 'nt' are looked up
# in the `school` table; FindSchool(...) supplies `schoolname`; the stripped
# message is sent to mybot.respond; for a '#' response with a known school the
# message is sent to a classifier at 127.0.0.1:50009 and the reply is used as
# `intention`.
#
# NOTE(review): both SQL statements are built by concatenating `w.word`, which
# comes from the input text — prefer cursor.execute(sql, params) with
# placeholders.
# NOTE(review): `db.close()` sits inside the `try` body, so it looks as if the
# connection stays open when execute/fetch raises — confirm.
# NOTE(review): `dbport` is assigned but not passed to pymysql.connect.
# NOTE(review): the blank-input guard compares with '无', not with ''.
# NOTE(review): `exit()` is called when the segmented message equals 'q'.
# NOTE(review): the indentation of this block has been lost and commented-out
# code is interleaved with live code; the nesting of the
# `if unicode(intention) in ansdict` / `else` / `if (findAns == False)` chain
# cannot be confirmed from this text.
def QA(input_message, mybot): findAns = False reply = '' ansdict = {} dbname = 'zwgx' #数据库名 dbip = 'localhost' #数据库IPlocalhost dbport = 3306 #数据库端口 dbusername = '******' #数据库用户名 dbpassword = '******' #数据库密码root schoolname = '' intention = '' if len(input_message) > 60: reply = mybot.respond("句子长度过长") findAns = True elif input_message.strip() == '无': reply = mybot.respond("无") findAns = True if (findAns == False): #print input_message #传入一个b''未加工的对象 message = T.wordSegment(input_message) # 分词去标点 if message == 'q': exit() else: print 'word Seg:' + message print utf_to_bgk('词性:') words = T.postag(input_message) #词性标注 格式 词/词性 for w in words: print w.word, w.flag if w.flag == 'school': try: #先用自定义的分词处理得到对应的词性,然后根据词性到数据库查询 db = pymysql.connect(host=dbip, user=dbusername, passwd=dbpassword, db=dbname, charset="utf8") cursor = db.cursor() sql = u"SELECT `学校名` FROM 学校简称 WHERE `简称`='" + w.word + "'" # 执行SQL语句 cursor.execute(sql) # 获取所有记录列表,已验证数据库查询跑通 results = cursor.fetchall() #print results #替换简称 if len(results) > 0: input_message = input_message.replace( w.word, results[0][0]).__str__() w.flag = 'nt' w.word = results[0][0] print utf_to_bgk(input_message), utf_to_bgk( w.word), utf_to_bgk(w.flag) # 关闭数据库连接 db.close() except Exception as e: print(e) # 识别学校简称并配对数据库中已存内容 if w.flag == 'x' or w.flag == 'nt': try: db = pymysql.connect(host=dbip, user=dbusername, passwd=dbpassword, db=dbname, charset="utf8") cursor = db.cursor() sql = u"SELECT `属性`,`内容` FROM school WHERE `学校`='" + w.word + "'" # 执行SQL语句 cursor.execute(sql) # 获取所有记录列表 results = cursor.fetchall() #print u'flag转化之后的查询',results if len(results) > 0: for row in results: ansdict[row[0]] = row[1] #print row[0],row[1] # reply +=row[0].encode("utf8") # reply+=" ".encode("utf8") # shuxing=raw_input('Frank:你想了解什么属性 ' + reply+">>") # sql = u"SELECT `内容` FROM school WHERE `学校`='" + w.word + u"'AND `属性`='"+shuxing+"'" # cursor.execute(sql) # results = cursor.fetchall() # if len(results)>0: # print "Frank: 
"+results[0][0].encode("utf8") # reply=results[0][0].encode("utf8") # return reply # 关闭数据库连接 #print u'查询之后的结果储存',ansdict 将数据库中的所有信息写入到ansdict中 db.close() except Exception as e: print(e) #todo: 每个词去找查数据库可以优化一下 加一下词性判断 #获得学校的名称 if FindSchool(dbip, dbusername, dbpassword, dbname, w.word) != "": schoolname = FindSchool(dbip, dbusername, dbpassword, dbname, w.word) uni = input_message.strip().decode('utf-8') print u'查看返回值', uni, utf_to_bgk(input_message.strip()) response = mybot.respond( input_message.strip()) #如果未给传入参数转化为utf8则报错 print "=======" #print response print "=======+" if response == "": reply = mybot.respond('找不到答案') findAns = True print 'Frank1:' + utf_to_bgk(reply) # ********************************************************************************* # 百科搜索 aiml机器人没有没有 elif response[0] == '#': # 匹配百科 # if response.__contains__("searchbaike"): # print "searchbaike" # print response # res = response.split(':') # # 实体 # entity = str(res[1]).replace(" ", "") # # 属性 # attr = str(res[2]).replace(" ", "") # print entity + '<---->' + attr # # ans = baike.query(entity, attr) # # 如果命中答案 # if type(ans) == list: # print 'Frank:' + QAT.ptranswer(ans, False) # reply = QAT.ptranswer(ans, False) # findAns = True # elif ans.decode('utf-8').__contains__(u'::找不到'): # # 百度摘要+Bing摘要 # print "通用搜索" # ans = search_summary.kwquery(input_message) # # # 匹配不到模版,通用查询 # elif response.__contains__("NoMatchingTemplate"): # print "NoMatchingTemplate" # #当复杂问题时,通过分类器模型进行分类再查询 if (schoolname != ""): sock = socket(AF_INET, SOCK_STREAM) sock.connect(('127.0.0.1', 50009)) sock.sendall(input_message.encode("utf-8")) intention = sock.recv(1024) sock.close() print utf_to_bgk(intention), u'经过分类器处理后的结果' #经过dl识别后分类问题,如果问题在数据库中,即把问题分类为数据库的一个属性,再调用属性值,可以增加数据库的属性分类和值 if unicode(intention) in ansdict: reply = ansdict[unicode(intention)] #print 'Frank:' + reply.encode("utf8") #print 'Frank2:' + utf_to_bgk(reply) #如果问题没有在数据库预存储 else: TempDict = search_summary.kwquery(input_message, intention, schoolname) 
ansdict['schoolname'] = TempDict['schoolname'] ansdict['intention'] = TempDict['intention'] ansdict['index'] = TempDict['index'] ans = TempDict['answer'] if (findAns == False): if len(ans) == 0: ans = mybot.respond('找不到答案') #print 'Frank3:' + utf_to_bgk(ans) reply = ans findAns = True elif len(ans) > 1: print u"不确定候选答案" print 'Frank4: ' for a in ans: print a.encode("utf8") reply += a.encode("utf8") + '\n' findAns = True else: #print 'Frank5:' + ans[0].encode("utf8") reply = ans[0].encode("utf8") findAns = True # 匹配模版 else: print 'Frank6:' + utf_to_bgk(response) reply = response findAns = True ansdict['baidu'] = reply json_s = json.dumps(ansdict) return json_s
def QA(input_message, mybot):
    """Answer one user message and return the outcome as a JSON string.

    The message is segmented and POS-tagged.  Words tagged ``school`` are
    replaced by the full school name looked up in the ``学校简称`` table, and
    for words tagged ``x`` or ``nt`` every (attribute, content) row of the
    ``school`` table is loaded into the result dictionary.  The (possibly
    rewritten) message is then sent to the AIML kernel:

    * empty reply: the kernel's "找不到答案" response is returned;
    * reply starting with ``#``: if a school was recognised, the message is
      sent to the local service on 127.0.0.1:50009 and the returned intention
      is looked up in the loaded attributes, falling back to
      ``search_summary.kwquery``;
    * anything else: the kernel reply is returned unchanged.

    Args:
        input_message: Raw user message.
        mybot: AIML kernel, or any object exposing ``respond(text)``.

    Returns:
        A JSON object string.  ``baidu`` holds the reply text; database
        attribute rows and, after a web search, ``schoolname``, ``intention``
        and ``index`` appear as additional keys.
    """
    reply = ''
    ansdict = {}
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration instead.
    dbname = 'zwgx'  # database name
    dbip = '106.14.124.221'  # database host
    dbport = 3306  # database port
    dbusername = '******'  # database user
    dbpassword = '******'  # database password
    schoolname = ''

    def fetch_rows(sql, params):
        """Run one parameterized SELECT and return every row."""
        db = pymysql.connect(host=dbip,
                             port=dbport,
                             user=dbusername,
                             passwd=dbpassword,
                             db=dbname,
                             charset="utf8")
        try:
            cursor = db.cursor()
            # The driver escapes the parameters, so user text never becomes
            # part of the SQL statement itself.
            cursor.execute(sql, params)
            return cursor.fetchall()
        finally:
            # Close even when the query fails so connections are not leaked.
            db.close()

    if len(input_message) > 60:
        reply = mybot.respond("句子长度过长")
    elif input_message.strip() == '无':
        reply = mybot.respond("无")
    else:
        # Word segmentation with punctuation removed.
        message = T.wordSegment(input_message)
        if message == 'q':
            exit()

        print('word Seg:' + message)
        print('词性:')
        for w in T.postag(input_message):
            print('%s %s' % (w.word, w.flag))

            # Replace a school abbreviation by the full school name.
            if w.flag == 'school':
                try:
                    results = fetch_rows(
                        u"SELECT `学校名` FROM 学校简称 WHERE `简称`=%s",
                        (w.word,))
                    if len(results) > 0:
                        input_message = str(
                            input_message.replace(w.word, results[0][0]))
                        w.flag = 'nt'
                        w.word = results[0][0]
                except Exception as e:
                    # Best effort: a failed lookup must not abort the answer.
                    print(e)

            # Load the attributes already stored for this school.
            if w.flag == 'x' or w.flag == 'nt':
                try:
                    rows = fetch_rows(
                        u"SELECT `属性`,`内容` FROM school WHERE `学校`=%s",
                        (w.word,))
                    for row in rows:
                        ansdict[row[0]] = row[1]
                except Exception as e:
                    print(e)

            # TODO: this lookup runs for every word; filter by POS tag first.
            found_school = FindSchool(dbip, dbusername, dbpassword, dbname,
                                      w.word)
            if found_school != "":
                schoolname = found_school

        response = mybot.respond(input_message.strip())
        print("=======")
        print(response)
        print("=======")

        if response == "":
            reply = mybot.respond('找不到答案')
            print('Frank:' + reply)
        elif response[0] == '#':
            # The kernel asks for an external lookup; this is only attempted
            # when a school name was recognised in the message.
            if schoolname != "":
                sock = socket(AF_INET, SOCK_STREAM)
                try:
                    sock.connect(('127.0.0.1', 50009))
                    sock.sendall(input_message.encode("utf-8"))
                    intention = sock.recv(1024)
                finally:
                    sock.close()
                print(intention)

                intention_key = unicode(intention)
                if intention_key in ansdict:
                    # The database already stores this attribute.
                    reply = ansdict[intention_key]
                    print('Frank:' + reply.encode("utf8"))
                else:
                    TempDict = search_summary.kwquery(input_message, intention,
                                                      schoolname)
                    ansdict['schoolname'] = TempDict['schoolname']
                    ansdict['intention'] = TempDict['intention']
                    ansdict['index'] = TempDict['index']
                    ans = TempDict['answer']

                    if len(ans) == 0:
                        ans = mybot.respond('找不到答案')
                        print('Frank:' + ans)
                        reply = ans
                    elif len(ans) > 1:
                        # Several candidates: return all, one per line.
                        print("不确定候选答案")
                        print('Frank: ')
                        for a in ans:
                            print(a.encode("utf8"))
                            reply += a.encode("utf8") + '\n'
                    else:
                        print('Frank:' + ans[0].encode("utf8"))
                        reply = ans[0].encode("utf8")
        else:
            # A template matched: the kernel reply is the answer.
            print('Frank:' + response)
            reply = response

    ansdict['baidu'] = reply
    return json.dumps(ansdict)
def qa():
    """Run the interactive console question-answering loop.

    Initialises the tokenizer, loads the AIML corpus files from the
    ``resources`` directory next to this module, prints the banner and then
    reads messages from standard input until the segmented message is ``q``.

    For each message the AIML kernel is asked first.  A reply starting with
    ``#`` requests an external lookup: ``searchbaike`` replies are resolved
    with ``baike.query`` (falling back to ``search_summary.kwquery`` when the
    attribute is not found) and ``NoMatchingTemplate`` replies go straight to
    ``search_summary.kwquery``.  Any other non-empty reply is printed as is.
    """
    # Initialise the jieba tokenizer.
    T.jieba_initialize()

    mybot = aiml.Kernel()
    resource_dir = os.path.split(os.path.realpath(__file__))[0] + "/resources/"
    for corpus in ("std-startup.xml", "Common conversation.aiml", "bye.aiml",
                   "tools.aiml", "bad.aiml", "funny.aiml",
                   "OrdinaryQuestion.aiml"):
        mybot.learn(resource_dir + corpus)
    # mybot.respond('Load Doc Snake')

    print(''' .----------------. .-----------------. .----------------. .----------------. .----------------. | .--------------. || .--------------. || .--------------. || .--------------. || .--------------. | | | _______ | || | ____ _____ | || | __ | || | ___ ____ | || | _________ | | | | / ___ | | || ||_ \|_ _| | || | / \ | || | |_ ||_ _| | || | |_ ___ | | | | | | (__ \_| | || | | \ | | | || | / /\ \ | || | | |_/ / | || | | |_ \_| | | | | '.___`-. | || | | |\ \| | | || | / /__\ \ | || | | __'. | || | | _| _ | | | | |`\____) | | || | _| |_\ |_ | || | _/ / \ \_ | || | _| | \ \_ | || | _| |___/ | | | | | |_______.' 
| || ||_____|\____| | || ||____| |____|| || | |____||____| | || | |_________| | | | | | || | | || | | || | | || | | | | '--------------' || '--------------' || '--------------' || '--------------' || '--------------' | '----------------' '----------------' '----------------' '----------------' '----------------' Eric:你好,我是Eric。╭(╯^╰)╮ ''')

    while True:
        input_message = raw_input("Enter your message >> ")
        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
            continue
        if input_message.strip() == '':
            print(mybot.respond("无"))
            continue

        print(input_message)
        message = T.wordSegment(input_message)  # punctuation removed
        print('word Seg:' + message)
        print('词性:')
        # NOTE(review): the tagging result is never used here; the call is
        # kept only because its side effects cannot be ruled out from here.
        T.postag(input_message)
        if message == 'q':
            exit()

        response = mybot.respond(message)
        print("=======")
        print(response)
        print("=======")

        if response == "":
            print('Eric:' + mybot.respond('找不到答案'))
            continue
        if response[0] != '#':
            # A template matched: the kernel reply is the answer.
            print('Eric:' + response)
            continue

        # Start every lookup with an empty candidate list so that a '#' reply
        # of an unknown kind reports "not found" instead of reusing the
        # previous turn's candidates (or failing on an unbound name).
        ans = []
        if "searchbaike" in response:
            # Encyclopedia lookup: the reply carries "...:entity:attribute".
            print("searchbaike")
            print(response)
            res = response.split(':')
            entity = str(res[1]).replace(" ", "")
            attr = str(res[2]).replace(" ", "")
            print(entity + '<---->' + attr)

            ans = baike.query(entity, attr)
            if isinstance(ans, list):
                print('Eric:' + QAT.ptranswer(ans, False))
                continue
            if u'::找不到' in ans.decode('utf-8'):
                # Attribute not found: fall back to the search summaries.
                print("通用搜索")
                ans = search_summary.kwquery(input_message)
            else:
                # A plain-string hit is the answer itself; it must not be
                # treated as a list of candidates below.
                print('Eric:' + ans)
                continue
        elif "NoMatchingTemplate" in response:
            # No template matched: general search.
            print("NoMatchingTemplate")
            ans = search_summary.kwquery(input_message)

        if len(ans) == 0:
            print('Eric:' + mybot.respond('找不到答案'))
        elif len(ans) > 1:
            print("不确定候选答案")
            print('Eric: ')
            for a in ans:
                print(a.encode("utf8"))
        else:
            print('Eric:' + ans[0].encode("utf8"))
def kwquery(query):
    """Search Baidu for *query* and return a list of candidate answers.

    Strategy, in order, over the result blocks with ids 1 through 9:

    1. If the first result carries a ``mu`` attribute and contains an
       ``op_exactqa_s_answer`` element (Baidu knowledge-graph answer), its
       text is the answer.
    2. Otherwise, when the query has at least two noun keywords,
       ``kgquery_entity`` is asked once with the first two of them.
    3. Otherwise, if one of the first two results is a Baidu Baike entry, the
       entry's ``lemma-summary`` text is the answer.
    4. If none of these hit, the collected result text is split into
       sentences, sentences containing a keyword are POS-tagged, and the
       (up to) three most frequent person names that are not themselves
       keywords are returned.

    Args:
        query: The question text.

    Returns:
        A list of answer strings; empty when nothing was found.
    """
    # Keywords are the words whose POS tag contains "n" (nouns).
    keywords = [k.word for k in T.postag(query) if "n" in k.flag]

    answer = []
    text = ''
    found = False
    kg_tried = False

    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' +
                                   quote(query))
    if soup_baidu is not None:
        for i in range(1, 10):
            results = soup_baidu.find(id=i)
            if results is None:
                break

            # A first result with a "mu" attribute may hold a direct answer.
            if i == 1 and results.attrs.get('mu'):
                exact = results.find(class_='op_exactqa_s_answer')
                if exact is not None:
                    answer.append(exact.get_text().strip())
                    found = True
                    break

            # The knowledge-graph lookup does not depend on the current
            # result, so it is attempted only once.
            if not kg_tried and len(keywords) > 1:
                kg_tried = True
                ans_lst = list(kgquery_entity(keywords[0], keywords[1]))
                if ans_lst:
                    answer = ans_lst
                    found = True
                    break

            # Baidu Baike entry among the first two results.
            if i in (1, 2):
                heading = results.find("h3")
                link = heading.find("a") if heading is not None else None
                if link is not None and u"百度百科" in link.get_text():
                    # .get() yields None for a missing href instead of raising.
                    url = link.get('href')
                    if url is None:
                        continue
                    baike_soup = To.get_html_baike(url)
                    if baike_soup is None:
                        continue
                    summary = baike_soup.find(class_='lemma-summary')
                    if summary is None:
                        continue
                    answer.append(summary.get_text().replace("\n", "").strip())
                    found = True
                    break

            text += results.get_text().strip()

    if not found:
        # Split the collected text into sentences on the delimiters below;
        # empty fragments between consecutive delimiters are skipped.
        cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
        sentences = []
        temp = ''
        for ch in text:
            if ch in cutlist:
                if temp:
                    sentences.append(temp)
                    temp = ''
            else:
                temp += ch

        # Keep only the (distinct) sentences that mention a keyword.
        key_sentences = {}
        for s in sentences:
            if any(k in s for k in keywords):
                key_sentences[s] = 1

        # Count the person names ("nr" tag) in those sentences.
        target_list = {}
        for ks in key_sentences:
            for w in T.postag(ks):
                if w.flag == "nr":
                    target_list[w.word] = target_list.get(w.word, 0) + 1

        ranked = sorted(target_list.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        # Names that are themselves keywords are not answers; keep the top 3.
        answer = [name for name, _count in ranked
                  if name not in keywords][:3]

    return answer