예제 #1
0
def matcherTesting(matcherType, removeStopWords=False):
    """Interactively exercise a matcher from the console.

    Builds a matcher with ``getMatcher`` and then loops forever: reads a
    query from stdin, prints the best-matching title with its similarity,
    loads the reply shard that contains the matched index, and prints the
    five responses returned by ``Evaluator.getBestResponse``.

    Args:
        matcherType: Matcher identifier forwarded to ``getMatcher``.
        removeStopWords: Forwarded to ``getMatcher`` unchanged.

    The loop has no exit condition of its own; it ends only when a call
    inside it raises (for example ``input`` on EOF or Ctrl-C).
    """
    matcher = getMatcher(matcherType, removeStopWords)
    # A single evaluator serves every query instead of being rebuilt per turn.
    evaluator = Evaluator()

    while True:
        query = input("隨便說些什麼吧: ")
        title, index = matcher.match(query)
        sim = matcher.getSimilarity()
        print("最為相似的標題是 %s ,相似度為 %d " % (title, sim))

        # Replies are sharded: file <index / 1000>.json, entry index % 1000.
        shardPath = os.path.join(
            "data/processed/reply/", str(int(index / 1000)) + ".json"
        )
        # The context manager closes the handle that the bare open() leaked.
        with open(shardPath, "r", encoding="utf-8") as shard:
            res = json.load(shard)
        targetId = index % 1000

        candidates = evaluator.getBestResponse(
            responses=res[targetId], topk=5, debugMode=False
        )
        print("以下是相似度前 5 高的回應")
        for candidate in candidates:
            print("%s %f" % (candidate[0], candidate[1]))
 def __init__(self):
     """Set up the "Fuzzy" matcher, the evaluator and the default replies.

     ``self.testSegment()`` is invoked after the matcher and evaluator
     exist and before ``self.defaultResponse`` is assigned.
     """
     self.matcher = match.getMatcher("Fuzzy")
     self.evaluator = Evaluator()
     self.testSegment()
     # Two canned replies stored for later use by the bot.
     self.defaultResponse = ["你在說什麼呢?", "我不太明白你的意思"]
예제 #3
0
 def __init__(self, match_type="bm25"):
     """Set up the matcher, the evaluator and the default replies.

     Args:
         match_type: Matcher identifier handed to ``match.getMatcher``;
             defaults to ``"bm25"``.

     ``self.testSegment()`` is invoked after the matcher and evaluator
     exist and before ``self.defaultResponse`` is assigned.
     """
     self.matcher = match.getMatcher(match_type)
     self.evaluator = Evaluator()
     self.testSegment()
     # Two canned replies stored for later use by the bot.
     self.defaultResponse = ["你在說什麼呢?", "我不太明白你的意思"]
예제 #4
0
    def __init__(self, match_type="bm25"):
        """Set up the matcher, the evaluator and the default replies.

        Args:
            match_type: Matcher identifier handed to ``match.getMatcher``;
                defaults to ``"bm25"``.

        The body is indented with spaces only; the previous mix of tabs and
        spaces is rejected by Python 3 with a TabError.
        """
        self.matcher = match.getMatcher(match_type)
        self.evaluator = Evaluator()
        self.testSegment()
        self.defaultResponse = [
            "?",
            "小哥哥,小哥哥,你在说什么啊?",
            "嗯",
        ]
        # Legacy alias: the list used to be stored under this misspelled name.
        self.defaultResonse = self.defaultResponse
예제 #5
0
def matcherTesting(matcherType, removeStopWords=False):
    """Interactively exercise a matcher from the console.

    Builds a matcher with ``getMatcher`` and then loops forever: reads a
    query from stdin, prints the best-matching title with its similarity,
    loads the reply shard that contains the matched index, and prints the
    five responses returned by ``Evaluator.getBestResponse``.

    Args:
        matcherType: Matcher identifier forwarded to ``getMatcher``.
        removeStopWords: Forwarded to ``getMatcher`` unchanged.

    The loop has no exit condition of its own; it ends only when a call
    inside it raises (for example ``input`` on EOF or Ctrl-C).
    """
    matcher = getMatcher(matcherType, removeStopWords)
    # A single evaluator serves every query instead of being rebuilt per turn.
    evaluator = Evaluator()

    while True:
        query = input("随便聊聊?: ")
        title, index = matcher.match(query)
        sim = matcher.getSimilarity()
        print("最为相似的标题是 %s, 相似度为 %d " % (title, sim))

        # Replies are sharded: file <index / 1000>.json, entry index % 1000.
        shardPath = os.path.join(
            "../knowledge_base/processed/reply/",
            str(int(index / 1000)) + '.json',
        )
        # The context manager closes the handle that the bare open() leaked.
        with open(shardPath, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = index % 1000

        candidates = evaluator.getBestResponse(
            responses=res[targetId], topk=5, debugMode=False
        )
        print("以下是相似度前5的回复:")
        for candidate in candidates:
            print("%s %f" % (candidate[0], candidate[1]))
예제 #6
0
class GossipBot(object):
    """
    Gossip-board chatbot ob'_'ov

    Matches a user query against known titles through ``self.matcher`` and
    answers with one of the top replies stored for the matched title, or
    with a canned default when the similarity is below a threshold.
    """

    def __init__(self, match_type="bm25"):
        """Build the matcher and evaluator, then self-test segmentation.

        Args:
            match_type: Matcher identifier handed to ``match.getMatcher``.
        """
        self.matcher = match.getMatcher(match_type)
        self.evaluator = Evaluator()
        self.testSegment()
        self.defaultResponse = ["你在說什麼呢?", "我不太明白你的意思"]

    def testSegment(self):
        """Try ``self.matcher.wordSegmentation`` on a sample and log the outcome.

        Any exception is logged and swallowed, so construction continues
        even when segmentation is unavailable.
        """
        logging.info("測試斷詞模塊中")
        try:
            self.matcher.wordSegmentation("測試一下斷詞")
            logging.info("測試成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模塊載入失敗,請確認data與字典齊全")

    def chatTime(self):
        """Run an endless console chat: read a line, print the bot's reply."""
        print("MianBot: 您好,我是你的老朋友眠寶,讓我們來聊聊八卦吧 o_o ")
        while True:
            query = input("User: ")
            print("MianBot: " + self.getResponse(query))

    def getResponse(self, query, threshold=50):
        """Return a reply string for ``query``.

        Args:
            query: User utterance passed to ``self.matcher.match``.
            threshold: Minimum similarity required to use stored replies;
                below it a random entry of ``self.defaultResponse`` is
                returned.

        Returns:
            The chosen reply text.
        """
        _, index = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return random.choice(self.defaultResponse)

        # Replies are sharded: file <index / 1000>.json, entry index % 1000.
        shardPath = os.path.join("data/processed/reply/",
                                 str(int(index / 1000)) + '.json')
        # The context manager closes the handle that the bare open() leaked.
        with open(shardPath, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = index % 1000
        candidates = self.evaluator.getBestResponse(res[targetId], topk=3)
        return self.randomPick(candidates)

    def randomPick(self, answers):
        """Return the first element of a randomly chosen candidate.

        Args:
            answers: Sequence of candidates; element 0 of each is returned
                as the reply.

        Returns:
            The picked reply, or "沒有資料" when ``answers`` is empty or
            cannot be indexed.
        """
        try:
            return random.choice(answers)[0]
        except (IndexError, KeyError, TypeError, ValueError):
            # Narrower than a bare except, so Ctrl-C is no longer swallowed.
            return "沒有資料"

    def randomTalks(self, num=100):
        """Print ``num`` exchanges using random lines of data/Titles.txt.

        Args:
            num: Number of query/reply pairs to print.
        """
        with open("data/Titles.txt", 'r', encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        for _ in range(num):
            query = random.choice(titles)
            print("User: " + query)
            print("MianBot: " + self.getResponse(query) + "\n")
예제 #7
0
class GossipBot(object):
    """
    Gossip-board chatbot ob'_'ov

    Matches a user query against known titles through ``self.matcher`` and
    answers with one of the top replies stored for the matched title, or
    with a canned default when the similarity is below a threshold.
    """

    def __init__(self):
        """Build the "Fuzzy" matcher and evaluator, then self-test segmentation."""
        self.matcher = match.getMatcher("Fuzzy")
        self.evaluator = Evaluator()
        self.testSegment()
        self.defaultResponse = [
            "你在說什麼呢?",
            "我不太明白你的意思",
        ]

    def testSegment(self):
        """Try ``self.matcher.wordSegmentation`` on a sample and log the outcome.

        Any exception is logged and swallowed, so construction continues
        even when segmentation is unavailable.
        """
        logging.info("測試斷詞模塊中")
        try:
            self.matcher.wordSegmentation("測試一下斷詞")
            logging.info("測試成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模塊載入失敗,請確認data與字典齊全")

    def chatTime(self):
        """Run an endless console chat: read a line, print the bot's reply."""
        print("MianBot: 您好,我是你的老朋友眠寶,讓我們來聊聊八卦吧 o_o ")
        while True:
            query = input("User: ")
            print("MianBot: " + self.getResponse(query))

    def getResponse(self, query, threshold=50):
        """Return a reply string for ``query``.

        Args:
            query: User utterance passed to ``self.matcher.match``.
            threshold: Minimum similarity required to use stored replies;
                below it a random entry of ``self.defaultResponse`` is
                returned.

        Returns:
            The chosen reply text.
        """
        _, index = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return random.choice(self.defaultResponse)

        # Replies are sharded: file <index / 1000>.json, entry index % 1000.
        shardPath = os.path.join("data/processed/reply/",
                                 str(int(index / 1000)) + '.json')
        # The context manager closes the handle that the bare open() leaked.
        with open(shardPath, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = index % 1000
        candidates = self.evaluator.getBestResponse(res[targetId], topk=3)
        return self.randomPick(candidates)

    def randomPick(self, answers):
        """Return the first element of a randomly chosen candidate.

        Args:
            answers: Sequence of candidates; element 0 of each is returned
                as the reply.

        Returns:
            The picked reply, or "沒有資料" when ``answers`` is empty or
            cannot be indexed.
        """
        try:
            return random.choice(answers)[0]
        except (IndexError, KeyError, TypeError, ValueError):
            # Narrower than a bare except, so Ctrl-C is no longer swallowed.
            return "沒有資料"

    def randomTalks(self, num=100):
        """Print ``num`` exchanges using random lines of data/Titles.txt.

        Args:
            num: Number of query/reply pairs to print.
        """
        with open("data/Titles.txt", 'r', encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        for _ in range(num):
            query = random.choice(titles)
            print("User: " + query)
            print("MianBot: " + self.getResponse(query) + "\n")
예제 #8
0
class chatBot(object):
    '''
    momo chat

    Matches a user query against known titles through ``self.matcher`` and
    answers with one of the top replies stored for the matched title, or
    with a canned default when the similarity is below a threshold.
    '''

    def __init__(self, match_type="bm25"):
        """Build the matcher and evaluator, then self-test segmentation.

        Args:
            match_type: Matcher identifier handed to ``match.getMatcher``.
        """
        self.matcher = match.getMatcher(match_type)
        self.evaluator = Evaluator()
        self.testSegment()
        self.defaultResponse = [
            "?",
            "小哥哥,小哥哥,你在说什么啊?",
            "嗯",
        ]
        # Legacy alias: the list used to be stored under this misspelled name.
        self.defaultResonse = self.defaultResponse

    def testSegment(self):
        """Try ``self.matcher.wordSegmentation`` on a sample and log the outcome.

        Any exception is logged and swallowed, so construction continues
        even when segmentation is unavailable.
        """
        logging.info("测试断词模块")
        try:
            self.matcher.wordSegmentation("测试断词")
            logging.info("测试成功")
        except Exception as e:
            logging.info(repr(e))
            logging.info("模块载入失败,请确认字典齐全")

    def chatTime(self):
        """Run an endless console chat: read a line, print the bot's reply."""
        print("幂酱:废话少说,有钱有车有房嘛?")
        while True:
            query = input("User: ")
            print("幂酱: " + self.getResponse(query))

    def getResponse(self, query, threshold=50):
        """Return a reply string for ``query``.

        Args:
            query: User utterance passed to ``self.matcher.match``.
            threshold: Minimum similarity required to use stored replies;
                below it a random entry of ``self.defaultResponse`` is
                returned.

        Returns:
            The chosen reply text.
        """
        _, idx = self.matcher.match(query)
        sim = self.matcher.getSimilarity()
        if sim < threshold:
            return random.choice(self.defaultResponse)

        # Replies are sharded: file <idx / 1000>.json, entry idx % 1000.
        # The shard name previously used an undefined name instead of idx.
        shardPath = os.path.join("data/processed/reply/",
                                 str(int(idx / 1000)) + '.json')
        # The context manager closes the handle that the bare open() leaked.
        with open(shardPath, 'r', encoding='utf-8') as shard:
            res = json.load(shard)
        targetId = idx % 1000
        candidates = self.evaluator.getBestResponse(res[targetId], topk=3)
        return self.randomPick(candidates)

    def randomPick(self, answers):
        """Return the first element of a randomly chosen candidate.

        Args:
            answers: Sequence of candidates; element 0 of each is returned
                as the reply.

        Returns:
            The picked reply, or "404 Not Found" when ``answers`` is empty
            or cannot be indexed.
        """
        try:
            return random.choice(answers)[0]
        except (IndexError, KeyError, TypeError, ValueError):
            # Narrower than a bare except, so Ctrl-C is no longer swallowed.
            return "404 Not Found"

    def randomTalks(self, num=100):
        """Print ``num`` exchanges using random lines of data/Titles.txt.

        Args:
            num: Number of query/reply pairs to print.
        """
        with open("data/Titles.txt", 'r', encoding='utf-8') as data:
            titles = [line.strip('\n') for line in data]
        for _ in range(num):
            query = random.choice(titles)
            print("User: " + query)
            print("幂酱:" + self.getResponse(query) + "\n")
예제 #9
0
def corpusGenerator(matcherType, removeStopWords=False):
    """Print a title/reply corpus built from ``data/Titles.txt``.

    The N-th title (0-based) is paired with entry ``N % 1000`` of reply
    shard ``N // 1000`` under ``data/processed/reply/``. Every reply
    returned by ``Evaluator.getBestResponse`` is printed as three lines:
    the title, the reply text and ``===``. A title with no replies is
    printed once with a placeholder reply.

    Args:
        matcherType: Matcher identifier forwarded to ``getMatcher``.
        removeStopWords: Forwarded to ``getMatcher`` unchanged.
    """
    # The matcher is not consulted below; the call is kept because building
    # it may have side effects this function relies on -- TODO confirm.
    getMatcher(matcherType, removeStopWords)
    # A single evaluator serves every title instead of being rebuilt per line.
    evaluator = Evaluator()

    loadedShard = None
    res = None
    with open("data/Titles.txt", 'r', encoding='utf-8') as data:
        for index, line in enumerate(data):
            query = line.strip('\n')

            # Consecutive titles share a shard, so parse each file once
            # rather than once per title.
            shardId = index // 1000
            if shardId != loadedShard:
                shardPath = os.path.join("data/processed/reply/",
                                         str(shardId) + '.json')
                with open(shardPath, 'r', encoding='utf-8') as shard:
                    res = json.load(shard)
                loadedShard = shardId
            targetId = index % 1000

            candidates = evaluator.getBestResponse(responses=res[targetId],
                                                   topk=100000,
                                                   debugMode=False)
            if not candidates:
                print("%s\n%s\n===" % (query, "找不到資料"))
                continue
            for candidate in candidates:
                print("%s\n%s\n===" % (query, candidate[0]))
예제 #10
0
# Input location: data_path is base_path and input_path joined by plain
# string concatenation, so both parts must keep their trailing slash.
# Presumably this directory holds the crawler's JSON output -- confirm.
base_path = './'
input_path = 'crawler_save/'
data_path = base_path + input_path   

## output file name & path
output_file_name = 'QA_file'
output_file = './' + output_file_name + '.txt'

## load all json path
# Collect every entry directly under data_path whose name ends in ".json",
# in the order os.listdir returns them.
json_file_list = []
for file in os.listdir(data_path):
    if file.endswith(".json"):
        json_file_path = data_path + file
        json_file_list.append(json_file_path)

evaluator = Evaluator()

# Progress counter and total file count, used by the per-file progress
# message printed in the processing loop that follows.
i = 0
total_len = len(json_file_list)

with open(output_file, 'w', encoding='utf-8') as output:
    for json_file in json_file_list:
        i = i +1
        print('Now deal: ', json_file, '  (', i, '/', total_len, ')')

        with open(json_file, 'r', encoding='utf-8') as data:
            json_dict = json.load(data)

            for article in json_dict:
                temp_title = ''