def matcherTesting(matcherType, removeStopWords=False):
    """Interactive smoke test for a matcher backend (runs until interrupted).

    Reads a query from stdin, shows the most similar crawled title and its
    similarity score, then prints the top-5 scored replies for that title.

    Args:
        matcherType: backend name forwarded to getMatcher.
        removeStopWords: whether the matcher should strip stop words.
    """
    matcher = getMatcher(matcherType, removeStopWords)
    while True:
        query = input("隨便說些什麼吧: ")
        title, index = matcher.match(query)
        sim = matcher.getSimilarity()
        print("最為相似的標題是 %s ,相似度為 %d " % (title, sim))
        # Replies are sharded 1000 titles per JSON file; index // 1000 picks
        # the shard, index % 1000 the entry inside it.
        shard_path = os.path.join(
            "data/processed/reply/", str(int(index / 1000)) + ".json"
        )
        # with-block closes the handle (the original json.load(open(...)) leaked it).
        with open(shard_path, "r", encoding="utf-8") as shard:
            res = json.load(shard)
        targetId = index % 1000
        evaluator = Evaluator()
        candidates = evaluator.getBestResponse(
            responses=res[targetId], topk=5, debugMode=False
        )
        print("以下是相似度前 5 高的回應")
        for candidate in candidates:
            print("%s %f" % (candidate[0], candidate[1]))
def __init__(self):
    """Wire up the fuzzy matcher and reply evaluator, then smoke-test segmentation."""
    self.matcher = match.getMatcher("Fuzzy")
    self.evaluator = Evaluator()
    self.testSegment()
    # Canned fallbacks for queries that match nothing well enough.
    self.defaultResponse = [
        "你在說什麼呢?",
        "我不太明白你的意思",
    ]
def __init__(self, match_type="bm25"):
    """Wire up matcher and evaluator, then smoke-test word segmentation.

    Args:
        match_type: matcher backend name forwarded to match.getMatcher.
    """
    self.matcher = match.getMatcher(match_type)
    self.evaluator = Evaluator()
    self.testSegment()
    # Canned fallbacks for queries that match nothing well enough.
    self.defaultResponse = [
        "你在說什麼呢?",
        "我不太明白你的意思",
    ]
def __init__(self, match_type="bm25"):
    """Wire up matcher and evaluator, then smoke-test word segmentation.

    Args:
        match_type: matcher backend name forwarded to match.getMatcher.
    """
    self.matcher = match.getMatcher(match_type)
    self.evaluator = Evaluator()
    self.testSegment()
    # Fixed attribute typo: was "defaultResonse", which the fallback path in
    # getResponse (reading self.defaultResponse) could never find, so a
    # low-similarity query raised AttributeError instead of a canned reply.
    self.defaultResponse = [
        "?",
        "小哥哥,小哥哥,你在说什么啊?",
        "嗯",
    ]
def matcherTesting(matcherType, removeStopWords=False):
    """Interactive smoke test for a matcher backend (runs until interrupted).

    Reads a query from stdin, shows the most similar crawled title and its
    similarity score, then prints the top-5 scored replies for that title.

    Args:
        matcherType: backend name forwarded to getMatcher.
        removeStopWords: whether the matcher should strip stop words.
    """
    matcher = getMatcher(matcherType, removeStopWords)
    while True:
        query = input("随便聊聊?: ")
        title, index = matcher.match(query)
        sim = matcher.getSimilarity()
        print("最为相似的标题是 %s, 相似度为 %d " % (title, sim))
        # Replies are sharded 1000 titles per JSON file; index // 1000 picks
        # the shard, index % 1000 the entry inside it.
        shard_path = os.path.join(
            "../knowledge_base/processed/reply/", str(int(index / 1000)) + '.json'
        )
        # with-block closes the handle (the original json.load(open(...)) leaked it).
        with open(shard_path, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = index % 1000
        evaluator = Evaluator()
        candidates = evaluator.getBestResponse(
            responses=res[targetId], topk=5, debugMode=False
        )
        print("以下是相似度前5的回复:")
        for candidate in candidates:
            print("%s %f" % (candidate[0], candidate[1]))
class GossipBot(object):
    """八卦板聊天機器人 ob'_'ov

    Gossiping-board chatbot: matches a user query against crawled article
    titles, then serves a highly-rated reply from the matched thread.
    """

    def __init__(self, match_type="bm25"):
        """Wire up matcher and evaluator, then smoke-test word segmentation.

        Args:
            match_type: matcher backend name forwarded to match.getMatcher.
        """
        self.matcher = match.getMatcher(match_type)
        self.evaluator = Evaluator()
        self.testSegment()
        # Canned fallbacks for low-similarity queries.
        self.defaultResponse = ["你在說什麼呢?", "我不太明白你的意思"]

    def testSegment(self):
        """Smoke-test the word-segmentation module; logs (never raises) on failure."""
        logging.info("測試斷詞模塊中")
        try:
            self.matcher.wordSegmentation("測試一下斷詞")
            logging.info("測試成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模塊載入失敗,請確認data與字典齊全")

    def chatTime(self):
        """Interactive chat loop on stdin/stdout; never returns.

        NOTE(review): the original source line was corrupted
        (`input("User: "******"MianBot: "...`); reconstructed as a
        read-then-reply pair — confirm against upstream history.
        """
        print("MianBot: 您好,我是你的老朋友眠寶,讓我們來聊聊八卦吧 o_o ")
        while True:
            query = input("User: ")
            print("MianBot: " + self.getResponse(query))

    def getResponse(self, query, threshold=50):
        """Return a reply for *query*, or a canned fallback below *threshold*.

        Args:
            query: user utterance.
            threshold: minimum title-similarity score to trust the match.
        """
        title, index = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return self.defaultResponse[random.randrange(0, len(self.defaultResponse))]
        # Replies are sharded 1000 titles per JSON file.
        shard_path = os.path.join(
            "data/processed/reply/", str(int(index / 1000)) + '.json'
        )
        # with-block closes the handle (the original json.load(open(...)) leaked it).
        with open(shard_path, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = index % 1000
        candidates = self.evaluator.getBestResponse(res[targetId], topk=3)
        return self.randomPick(candidates)

    def randomPick(self, answers):
        """Pick one (reply, score) tuple at random and return the reply text."""
        try:
            answer = answers[random.randrange(0, len(answers))][0]
        except (ValueError, IndexError):  # empty candidate list (was a bare except)
            answer = "沒有資料"
        return answer

    def randomTalks(self, num=100):
        """Self-chat demo: answer *num* random titles from the corpus.

        NOTE(review): the original print line was corrupted
        (`print("User: "******"MianBot: "...`); reconstructed as two prints.
        """
        with open("data/Titles.txt", 'r', encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        for i in range(0, num):
            query = titles[random.randrange(0, len(titles))]
            print("User: " + query)
            print("MianBot: " + self.getResponse(query) + "\n")
class GossipBot(object):
    """八卦板聊天機器人 ob'_'ov

    Gossiping-board chatbot using the fuzzy matcher: matches a user query
    against crawled titles and serves a highly-rated reply from the thread.
    """

    def __init__(self):
        """Wire up the fuzzy matcher and evaluator, then smoke-test segmentation."""
        self.matcher = match.getMatcher("Fuzzy")
        self.evaluator = Evaluator()
        self.testSegment()
        # Canned fallbacks for low-similarity queries.
        self.defaultResponse = ["你在說什麼呢?", "我不太明白你的意思"]

    def testSegment(self):
        """Smoke-test the word-segmentation module; logs (never raises) on failure."""
        logging.info("測試斷詞模塊中")
        try:
            self.matcher.wordSegmentation("測試一下斷詞")
            logging.info("測試成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模塊載入失敗,請確認data與字典齊全")

    def chatTime(self):
        """Interactive chat loop on stdin/stdout; never returns.

        NOTE(review): the original source line was corrupted
        (`input("User: "******"MianBot: "...`); reconstructed as a
        read-then-reply pair — confirm against upstream history.
        """
        print("MianBot: 您好,我是你的老朋友眠寶,讓我們來聊聊八卦吧 o_o ")
        while True:
            query = input("User: ")
            print("MianBot: " + self.getResponse(query))

    def getResponse(self, query, threshold=50):
        """Return a reply for *query*, or a canned fallback below *threshold*.

        Args:
            query: user utterance.
            threshold: minimum title-similarity score to trust the match.
        """
        title, index = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return self.defaultResponse[random.randrange(0, len(self.defaultResponse))]
        # Replies are sharded 1000 titles per JSON file.
        shard_path = os.path.join(
            "data/processed/reply/", str(int(index / 1000)) + '.json'
        )
        # with-block closes the handle (the original json.load(open(...)) leaked it).
        with open(shard_path, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = index % 1000
        candidates = self.evaluator.getBestResponse(res[targetId], topk=3)
        return self.randomPick(candidates)

    def randomPick(self, answers):
        """Pick one (reply, score) tuple at random and return the reply text."""
        try:
            answer = answers[random.randrange(0, len(answers))][0]
        except (ValueError, IndexError):  # empty candidate list (was a bare except)
            answer = "沒有資料"
        return answer

    def randomTalks(self, num=100):
        """Self-chat demo: answer *num* random titles from the corpus.

        NOTE(review): the original print line was corrupted
        (`print("User: "******"MianBot: "...`); reconstructed as two prints.
        """
        with open("data/Titles.txt", 'r', encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        for i in range(0, num):
            query = titles[random.randrange(0, len(titles))]
            print("User: " + query)
            print("MianBot: " + self.getResponse(query) + "\n")
class chatBot(object):
    '''
    momo chat — title-matching chitchat bot: matches a user query against
    crawled titles and serves a highly-rated reply from the matched thread.
    '''

    def __init__(self, match_type="bm25"):
        """Wire up matcher and evaluator, then smoke-test word segmentation.

        Args:
            match_type: matcher backend name forwarded to match.getMatcher.
        """
        self.matcher = match.getMatcher(match_type)
        self.evaluator = Evaluator()
        self.testSegment()
        # Unified spelling: the original mixed defaultResonse / defaultReponse /
        # defaultResponse across methods, so the fallback path always raised
        # AttributeError instead of returning a canned reply.
        self.defaultResponse = ["?", "小哥哥,小哥哥,你在说什么啊?", "嗯"]

    def testSegment(self):
        """Smoke-test the word-segmentation module; logs (never raises) on failure."""
        logging.info("测试断词模块")
        try:
            self.matcher.wordSegmentation("测试断词")
            logging.info("测试成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模块载入失败,请确认字典齐全")

    def chatTime(self):
        """Interactive chat loop on stdin/stdout; never returns.

        NOTE(review): the original source line was corrupted
        (`input("User: "******"幂酱: "...`); reconstructed as a
        read-then-reply pair — confirm against upstream history.
        """
        print("幂酱:废话少说,有钱有车有房嘛?")
        while True:
            query = input("User: ")
            print("幂酱: " + self.getResponse(query))

    def getResponse(self, query, threshold=50):
        """Return a reply for *query*, or a canned fallback below *threshold*.

        Args:
            query: user utterance.
            threshold: minimum title-similarity score to trust the match.
        """
        title, idx = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return self.defaultResponse[random.randrange(0, len(self.defaultResponse))]
        # Fixed NameError: the shard path used an undefined "index"; the match
        # result is bound to "idx" above. Replies shard 1000 titles per file.
        shard_path = os.path.join(
            "data/processed/reply/", str(int(idx / 1000)) + '.json'
        )
        # with-block closes the handle (the original json.load(open(...)) leaked it).
        with open(shard_path, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = idx % 1000
        candidates = self.evaluator.getBestResponse(res[targetId], topk=3)
        return self.randomPick(candidates)

    def randomPick(self, answers):
        """Pick one (reply, score) tuple at random and return the reply text."""
        try:
            answer = answers[random.randrange(0, len(answers))][0]
        except (ValueError, IndexError):  # empty candidate list (was a bare except)
            answer = "404 Not Found"
        return answer

    def randomTalks(self, num=100):
        """Self-chat demo: answer *num* random titles from the corpus.

        NOTE(review): the original print line was corrupted
        (`print("User: "******"幂酱:"...`); reconstructed as two prints.
        """
        with open("data/Titles.txt", 'r', encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        # Fixed SyntaxError: original read "for i range(0, num)" (missing "in").
        for i in range(0, num):
            query = titles[random.randrange(0, len(titles))]
            print("User: " + query)
            print("幂酱:" + self.getResponse(query) + "\n")
def corpusGenerator(matcherType, removeStopWords=False):
    """Dump a (query, reply) training corpus to stdout.

    Walks data/Titles.txt in order; for the i-th title it loads reply shard
    i // 1000, scores every reply with the evaluator, and prints one
    "query\\nreply\\n===" record per reply (or a "找不到資料" placeholder when
    the title has no usable reply).

    Args:
        matcherType: backend name forwarded to getMatcher.
        removeStopWords: whether the matcher should strip stop words.
    """
    # The matcher is built for parity with the interactive tester, but titles
    # are enumerated positionally here, so match() itself is never called.
    matcher = getMatcher(matcherType, removeStopWords)
    # One evaluator reused for every title (was re-created per iteration).
    evaluator = Evaluator()
    index = 0
    with open("data/Titles.txt", 'r', encoding='utf-8') as data:
        for line in data:
            query = line.strip('\n')
            shard_path = os.path.join(
                "data/processed/reply/", str(int(index / 1000)) + '.json'
            )
            # with-block closes the handle (the original json.load(open(...)) leaked it).
            with open(shard_path, 'r', encoding='utf-8') as shard:
                res = json.load(shard)
            targetId = index % 1000
            candidates = evaluator.getBestResponse(
                responses=res[targetId], topk=100000, debugMode=False
            )
            if len(candidates):
                for candidate in candidates:
                    print("%s\n%s\n===" % (query, candidate[0]))
            else:
                print("%s\n%s\n===" % (query, "找不到資料"))
            index += 1
base_path = './' input_path = 'crawler_save/' data_path = base_path + input_path ## output file name & path output_file_name = 'QA_file' output_file = './' + output_file_name + '.txt' ## load all json path json_file_list = [] for file in os.listdir(data_path): if file.endswith(".json"): json_file_path = data_path + file json_file_list.append(json_file_path) evaluator = Evaluator() i = 0 total_len = len(json_file_list) with open(output_file, 'w', encoding='utf-8') as output: for json_file in json_file_list: i = i +1 print('Now deal: ', json_file, ' (', i, '/', total_len, ')') with open(json_file, 'r', encoding='utf-8') as data: json_dict = json.load(data) for article in json_dict: temp_title = ''