예제 #1
0
 def __init__(self):
     """Restore a previously trained similarity model and load the answers.

     Side effects: reads model state from disk via ``restore_model`` and
     loads dataset/answer.txt into memory, one answer per line.
     """
     # Segmenter based on jieba; also removes stop words.
     seg = Seg()
     self.ss = SentenceSimilarity(seg)
     self.ss.restore_model()
     # Answers are presumably aligned by line number with the question
     # corpus the model was trained on — TODO confirm against trainer.
     with open("dataset/answer.txt", 'r', encoding='utf-8') as file_answer:
         self.line = file_answer.readlines()
예제 #2
0
class Chatbot_port2(object):
    """Retrieval chatbot: replies with the answer of the nearest known question."""

    def __init__(self):
        # jieba-based segmenter that also strips stop words
        self.ss = SentenceSimilarity(Seg())
        self.ss.restore_model()
        # answers are aligned by line number with the trained question corpus
        with open("dataset/answer.txt", 'r', encoding='utf-8') as file_answer:
            self.line = file_answer.readlines()

    def chat(self, question):
        """Return ``(answer, score)`` for the best match of *question*."""
        ranked = self.ss.similarity(question.strip())
        best_index, best_score = ranked[0][0], ranked[0][1]
        return self.line[best_index], best_score
예제 #3
0
def dictTest():
    """Evaluate TF-IDF sentence matching against the labelled test set.

    Builds a model over the dictionary corpus, then for every sample in the
    global ``X_test`` checks whether the best match carries the label in
    ``Y_test``; mismatches are logged to ah_data_lsi.txt. Prints the final
    accuracy ratio.
    """
    # sentence -> label lookup, filled in by readDictData()
    # (renamed from `dict` so the builtin is no longer shadowed)
    label_map = {}
    seg = Seg()
    original_ss = SentenceSimilarity(seg)
    readDictData(original_ss, label_map)
    original_ss.TfidfModel()
    #     original_ss.LdaModel()
    #     original_ss.LsiModel()
    total_data_len = len(X_test)
    success_len = 0
    # `with` guarantees the log file is closed even if a sample raises
    with open('ah_data_lsi.txt', 'w', encoding='utf-8') as f1:
        for i in range(len(X_test)):
            print("-------------------------------------")
            text = checkData(X_test[i])
            text = "".join(seg.cut_for_search(text))
            print("测试内容: " + text)

            try:
                sentences = original_ss.similarityArray(text)
                sentences = sorted(sentences,
                                   key=lambda e: e.get_score(),
                                   reverse=True)
                # count perfect-score matches; 2 or more means the query is
                # ambiguous within the corpus and is not counted as a success
                count = 0
                for sentence in sentences:
                    if sentence.get_score() > 0.9:
                        print(sentence.get_score())

                    if sentence.get_score() == 1.0:
                        count = count + 1

                sentence = original_ss.similarity(text)
                if count < 2 and label_map.get(
                        sentence.get_origin_sentence()) == Y_test[i]:
                    success_len = success_len + 1
                else:
                    y = Y_test[i]
                    f1.writelines("-------------------------------------\n")
                    f1.writelines("测试内容: " + text + "\n")
                    for sentence in sentences:
                        f1.writelines("匹配标签: 【" +
                                      label_map.get(sentence.get_origin_sentence()) +
                                      "】 真实标签:【" + y + "】 评分: " +
                                      str(sentence.get_score()) + "\n")
            except Exception as e:
                # best-effort: one bad sample must not abort the whole run
                print(e)
    print(success_len / total_data_len)
예제 #4
0
    def printInfo(event):
        """Tk callback: answer the question typed into ``text1``.

        Shows the best answer in ``text2``, clears the input widget, and
        reads the reply aloud via ``syn``.
        """
        # segmenter with a user dictionary loaded on top of the defaults
        seg = Seg()
        seg.load_userdict('../userdict/userdict.txt')
        # corpus: keyword lists, questions, and aligned answers
        List_kw, questionList, answerList = read_corpus1()
        # TF-IDF retrieval model over the question list
        ss = SentenceSimilarity(seg)
        ss.set_sentences(questionList)
        ss.TfidfModel()
        # ss.LsiModel()         # lsi model
        # ss.LdaModel()         # lda model
        text2.delete(1.0, END)
        query = text1.get('1.0', END)

        t_start = time.time()
        best_k = ss.similarity_k(query, 5)
        reply = ": {}".format(answerList[best_k[0][0]])
        text2.insert("insert", reply)
        # clear the input widget before speaking the reply
        text1.delete(1.0, END)
        syn(reply)
예제 #5
0
class kuakuaChat():
    """Compliment ("kuakua") chatbot backed by a Douban topic corpus."""

    def __init__(self):
        """Build the topic -> replies table and fit a TF-IDF matcher.

        Reads ./douban_kuakua_topic.txt where each line has the form
        ``question<######>ans1<$$$$$$>ans2...``.
        """
        self.qa_dict = {}
        self.q_list = []
        with open('./douban_kuakua_topic.txt', 'r',
                  encoding='utf8') as in_file:
            # iterate the file lazily instead of materialising readlines()
            for line in in_file:
                # split once and reuse both halves
                parts = line.split('<######>')
                que = parts[0].strip()
                # drop degenerate replies (separators, bare newlines)
                ans_list = [ans for ans in parts[-1].split('<$$$$$$>')
                            if len(ans) > 2]

                # very short questions are noise in the similarity index
                if len(que) > 5:
                    self.q_list.append(que)
                    self.qa_dict[que] = ans_list

        zhcn_seg = zhcnSeg()
        self.sent_sim = SentenceSimilarity(zhcn_seg)
        self.sent_sim.set_sentences(self.q_list)
        # TF-IDF is the default model
        self.sent_sim.TfidfModel()

    def answer_question(self, question_str):
        """Return the pooled canned replies of the most similar questions.

        :param question_str: user utterance
        :return: list of candidate replies from the top-4 matching topics
        """
        most_sim_questions = self.sent_sim.similarity_top_k(question_str, 4)
        answer_list = []
        for item in most_sim_questions:
            answer_list += self.qa_dict[item[0]]
        return answer_list
예제 #6
0
    def __init__(self):
        """Initialise the compliment-topic reply table.

        Parses ./douban_kuakua_topic.txt (one
        ``question<######>ans1<$$$$$$>ans2...`` record per line) into
        ``q_list`` / ``qa_dict`` and fits a TF-IDF similarity model over
        the collected questions.
        """
        self.qa_dict = {}
        self.q_list = []
        with open('./douban_kuakua_topic.txt', 'r',
                  encoding='utf8') as in_file:
            for line in in_file.readlines():
                # text before the first separator is the topic/question
                que = line.split('<######>')[0].strip()
                ans_list = []
                # replies live after the separator, delimited by <$$$$$$>
                for ans in line.split('<######>')[-1].split('<$$$$$$>'):
                    if len(ans) > 2:  # skip separators / bare newlines
                        ans_list.append(ans)

                if len(que) > 5:  # very short questions are noise
                    self.q_list.append(que)
                    self.qa_dict[que] = ans_list

        zhcn_seg = zhcnSeg()
        self.sent_sim = SentenceSimilarity(zhcn_seg)
        self.sent_sim.set_sentences(self.q_list)
        # TF-IDF is the default model
        self.sent_sim.TfidfModel()
예제 #7
0
def tf():
    dt = {}
    # if __name__ == '__main__':
    # 读入训练集
    file_obj = FileObj(r"train_data.txt")
    train_sentences = file_obj.read_lines()

    # 读入测试集
    file_obj = FileObj(r"test_data.txt")
    test1_sentences = file_obj.read_lines()

    # 分词工具,基于jieba分词,我自己加了一次封装,主要是去除停用词
    seg = Seg()

    # 训练模型
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # tfidf模型
    # ss.LsiModel()         # lsi模型
    # ss.LdaModel()         # lda模型

    # 测试集1
    right_count = 0
    # w=open("result510tf.txt",'w')
    # w.write(str("source_id") + '\t' + str("target_id") + '\n')
    for i in range(len(test1_sentences)):
        print "*********************"
        print i
        print test1_sentences[i]
        test = str(test1_sentences[i].encode("utf-8"))
        t = test.split(',')[0]
        dict = ss.similarity(test1_sentences[i])
        # dict的key为句子的(序号-1),value为计算出的距离
        for k, v in dict:
            print t, k + 1, v  # 如2784 2784 1.0
            ind2 = k + 1
            if (str(k + 1) == str(t)):
                print "same"
            else:
                # w.write(str(t) + '\t' + str(k+1) + '\n')
                addtodict2(dt, int(t), int(ind2), v)
    # w.close()
    return dt
예제 #8
0
def run_prediction(input_file_path, output_file_path):
    """For each test question, write the answer of its best training match.

    One answer per input line is written to *output_file_path*.

    :param input_file_path: path of the test-question file (one per line)
    :param output_file_path: path of the result file to create
    """
    # read the training questions
    file_obj = FileObj(r"./TFIDF_baseline/dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()

    # read the test questions
    file_obj = FileObj(input_file_path)
    test_sentences = file_obj.read_lines()

    # jieba-based segmenter; also removes stop words
    seg = Seg()

    # fit a TF-IDF similarity model over the training questions
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # tfidf model

    # answers are aligned with trainQuestions.txt by line number;
    # the `with` block closes the file as soon as it is read
    with open("./TFIDF_baseline/dataSet/trainAnswers.txt", 'r',
              encoding='utf-8') as file_answer:
        line = file_answer.readlines()

    # `with` ensures the result file is flushed and closed on every path
    # (the original left it open if an exception occurred mid-loop)
    with open(output_file_path, 'w') as file_result:
        for i in range(0, len(test_sentences)):
            top_15 = ss.similarity(test_sentences[i])
            # emit only the answer of the single best match
            file_result.write(line[top_15[0][0]] + '\n')
예제 #9
0
def main(question, top_k, task='faq'):
    """Find the *top_k* known questions most similar to *question*.

    :param question: input query string
    :param top_k: number of matches requested
    :param task: ``'chat'`` selects corpus 2; any other value selects
        corpus 1 (FAQ)
    :return: ``(question_k, filtered questions, filtered answers)``
    """
    # load the corpus that matches the task
    reader = read_corpus2 if task == 'chat' else read_corpus1
    qList_kw, questionList, answerList = reader()
    # simple inverted index over the per-question keyword lists
    invertTable = invert_idxTable(qList_kw)
    inputQuestionKW = seg.cut(question)

    # keyword pre-filter: keep only questions sharing a keyword with the query
    questionList_s, answerList_s = filter_questionByInvertTab(
        inputQuestionKW, questionList, answerList, invertTable)
    # TF-IDF similarity over the surviving candidates
    matcher = SentenceSimilarity(seg)
    matcher.set_sentences(questionList_s)
    matcher.TfidfModel()
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model
    question_k = matcher.similarity_k(question, top_k)
    return question_k, questionList_s, answerList_s
예제 #10
0
from cutWords import Seg
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
import time
from time import ctime
import threading
# Module-level setup: load the QA corpus and fit two retrieval models.
# These globals (train_sentences, line, ss1, ss2) are used by the model
# functions defined below.
file_obj = FileObj(r"dataSet/train_q.txt")
train_sentences = file_obj.read_lines()
# answers aligned with train_q.txt by line number
with open("dataSet/train_a.txt", 'r', encoding='utf-8') as file_answer:
    line = file_answer.readlines()

seg = Seg()

# train the models
ss1 = SentenceSimilarity(seg)
ss1.set_sentences(train_sentences)
ss1.TfidfModel()  # TF-IDF model

ss2 = SentenceSimilarity(seg)
ss2.set_sentences(train_sentences)
ss2.LsiModel()  # LSI model


def tfidf_model(sentence):
    """Return ``(score, answer)`` for the best TF-IDF match of *sentence*.

    Uses the module-level ``ss1`` model and the ``line`` answer list.
    """
    ranked = ss1.similarity(sentence)
    best_index, best_score = ranked[0][0], ranked[0][1]
    return best_score, line[best_index]

예제 #11
0
파일: runModel.py 프로젝트: xzwj/JDDC
from tqdm import tqdm

if __name__ == '__main__':
    # read the training set (disabled in this revision)
    # file_obj = FileObj(r"dataSet/trainQuestions.txt")
    # train_sentences = file_obj.read_lines()

    # read the test set
    file_obj = FileObj(r"dataSet/devQuestions.txt")
    test_sentences = file_obj.read_lines()

    # segmenter based on jieba; removes stop words
    seg = Seg()

    # fit the retrieval model
    # NOTE(review): the model is fitted on the *dev* questions while the
    # answers below come from trainAnswers.txt — looks inconsistent with
    # the commented-out training-set code above; confirm intent.
    ss = SentenceSimilarity(seg)
    ss.set_sentences(test_sentences)
    ss.TfidfModel()  # tfidf model

    # test set
    right_count = 0

    file_result = open('dataSet/result.txt', 'w')
    # answers aligned with trainQuestions.txt by line number
    with open("dataSet/trainAnswers.txt", 'r',
              encoding='utf-8') as file_answer:
        line = file_answer.readlines()

    for i in tqdm(range(0, len(test_sentences))):
        top_15 = ss.similarity(test_sentences[i])

        for j in range(0, len(top_15)):  # loop body continues beyond this excerpt
예제 #12
0
def plot_words(wordList):
    """Print token/type counts for *wordList* and plot the 10 commonest words."""
    freq = FreqDist(wordList)
    # print(freq.most_common())
    print("单词总数: ", freq.N())   # total token count
    print("不同单词数: ", freq.B())  # number of distinct words
    freq.plot(10)


if __name__ == '__main__':
    # set up the external user dictionary
    seg = Seg()
    seg.load_userdict('./userdict/userdict.txt')
    # read the QA corpus
    List_kw, questionList, answerList = read_corpus()
    # initialise the TF-IDF retrieval model over the questions
    ss = SentenceSimilarity(seg)
    ss.set_sentences(questionList)
    ss.TfidfModel()         # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # interactive loop: answer questions until the user types 'q'
    while True:
        question = input("请输入问题(q退出): ")
        if question == 'q':
            break
        time1 = time.time()
        question_k = ss.similarity_k(question, 5)
        print("亲,我们给您找到的答案是: {}".format(answerList[question_k[0][0]]))
        for idx, score in zip(*question_k):
            print("same questions: {},                score: {}".format(questionList[idx], score))
        time2 = time.time()  # timing report presumably follows this excerpt
예제 #13
0
    # (Python 2 fragment — starts mid-function; list_qa_dataset and
    # question_list are defined before this excerpt.)
    answer_list = [x[1] for x in list_qa_dataset]

    # hold out the last 100 QA pairs as a test split
    numQuestions_train = len(question_list) - 100
    numQuestions_test = 100

    question_list_train = question_list[:numQuestions_train]
    question_list_test = question_list[numQuestions_train:]

    answer_list_train = answer_list[:numQuestions_train]
    answer_list_test = answer_list[numQuestions_train:]

    # jieba-based segmenter wrapped to remove stop words
    seg = Seg()

    # train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(question_list_train)
    # ss.TfidfModel()         # tfidf model
    ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # test set 1
    right_count = 0
    for i in range(0,len(question_list_test)):
        sentenceK = ss.similarityK(question_list_test[i])
        print ' '
        print 'question: %s' % question_list_test[i]
        for k in range(len(sentenceK)):
            sentence_k = sentenceK[k]
            org_sentence = sentence_k.origin_sentence
            sentence_id = sentence_k.id
예제 #14
0
    # (Fragment — starts mid-function.) Read the training corpus.
    file_obj = FileObj(r"testSet/data2")
    train_sentences = file_obj.read_lines()

    # read test set 1
    file_obj = FileObj(r"testSet/testSet3")
    test1_sentences = file_obj.read_lines()

    # read test set 2 (disabled)
    #file_obj = FileObj(r"testSet/testSet2.txt")
    #test2_sentences = file_obj.read_lines()

    # jieba-based segmenter wrapped to remove stop words
    seg = Seg()

    # train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()         # tfidf model
    #ss.LsiModel()         # lsi model
    #ss.LdaModel()         # lda model
    ss.FasttxModel()

    # test set 1
    right_count = 0
    # NOTE(review): the loop is bounded by len(train_sentences) but indexes
    # test1_sentences — raises IndexError if the test set is shorter; the
    # two files are presumably the same length, confirm.
    for i in range(0, len(train_sentences)):
        print(test1_sentences[i])
        ss.similarity2(test1_sentences[i])
        print("\r\n")

    # test set 2
    # right_count = 0
예제 #15
0
    # (Python 2 fragment — starts mid-function.) Read the training corpus.
    file_obj = FileObj(r"testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()

    # read test set 1
    file_obj = FileObj(r"testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()

    # read test set 2
    file_obj = FileObj(r"testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()

    # jieba-based segmenter wrapped to remove stop words
    seg = Seg()

    # train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # test set 1: each test sentence i is expected to match training
    # sentence i (same index = correct retrieval)
    right_count = 0
    # NOTE(review): loop bound uses train_sentences while indexing
    # test1_sentences — assumes equal lengths; confirm.
    for i in range(0,len(train_sentences)):
        sentence = ss.similarity(test1_sentences[i])

        if i != sentence.id:
            print str(i) + " wrong! score: " + str(sentence.score)
        else:
            right_count += 1
            print str(i) + " right! score: " + str(sentence.score)
def plot_words(wordList):
    """Show frequency statistics for *wordList* and plot the top 10 words."""
    dist = FreqDist(wordList)
    # print(dist.most_common())
    print("单词总数: ", dist.N())   # total number of tokens
    print("不同单词数: ", dist.B())  # number of distinct words
    dist.plot(10)


if __name__ == '__main__':
    # set up external words
    seg = Seg()
    seg.load_userdict('userdict/userdict.txt')  # merge a custom word list into the default dictionary
    # read the corpus
    _, questionList, answerList = read_corpus()
    # initialise the model
    ss = SentenceSimilarity(seg)  # sets the self.reg attribute
    ss.set_sentences(questionList)  # sets self.sentences: a list of Sentence objects
    ss.TfidfModel()  # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # interactive loop: answer questions until the user types 'q'
    while True:
        question = input("请输入问题(q退出): ")
        if question == 'q':
            break
        time1 = time.time()  # NOTE(review): start time captured but never reported in this excerpt
        question_k = ss.similarity_k(question, 5)
        print("亲,我们给您找到的答案是: {}".format(answerList[question_k[0][0]]))
        for idx, score in zip(*question_k):
            print("same questions: {},                score: {}".format(
                questionList[idx], score))
예제 #17
0
    # (Fragment — starts mid-function.) Read the training corpus.
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()

    # read test set 1
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()

    # read test set 2
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()

    # jieba-based segmenter wrapped to remove stop words
    seg = Seg()

    # train the model
    ss = SentenceSimilarity(seg)      
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # test set 1: test sentence i should retrieve training sentence i
    right_count = 0
    # NOTE(review): loop bound uses train_sentences while indexing
    # test1_sentences — assumes both files have the same length; confirm.
    for i in range(0,len(train_sentences)):
        sentence = ss.similarity(test1_sentences[i])

        if i != sentence.id:
            print (str(i) + " wrong! score: " + str(sentence.score))
        else:
            right_count += 1
            print (str(i) + " right! score: " + str(sentence.score))
예제 #18
0
# Score each sentence in important_sentence against the question.

# Build the training set.
# TODO:
# The training set is currently the original wiki content + the question;
# revisit this choice later.
# NOTE(review): this does NOT copy — train_sentence aliases list_sentence,
# so the append below also mutates list_sentence. Presumably intentional;
# confirm, otherwise use list(list_sentence).
train_sentence = list_sentence
train_sentence.append(question)   
# Build the test set.
test_sentence = important_sentence

# jieba-based segmenter wrapped to remove stop words
seg = Seg()

# train the model
ss = SentenceSimilarity(seg)
ss.set_sentences(train_sentence)
ss.TfidfModel()         # tfidf model
#ss.LsiModel()         # lsi model
#ss.LdaModel()         # lda model

# similarity of every test sentence to the question
score_sentence = []   
for i in range(0,len(test_sentence)):
    score = ss.MYsimilarity(question, test_sentence[i])   
    score_sentence.append(score)    

# batch variant over the whole test set
new_score = ss.MYsimilarity2(question, test_sentence)

# write results to file
WriteFile("sentence.txt", important_sentence)
예제 #19
0
from sentence import Sentence

if __name__ == '__main__':
    # read the training set
    file_obj = FileObj(r"testSet/tjmsnew.txt")
    train_sentences = file_obj.read_lines()

    # read the test set
    file_obj = FileObj(r"testSet/zhenduanxx-utf.txt")
    test1_sentences = file_obj.read_lines()
    #test1_sentences = "子宫 肌瘤"

    # jieba-based segmenter; mainly removes stop words
    seg = Seg()

    # train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()         # tfidf model
    #ss.LsiModel()         # lsi model
    ss.LdaModel()  # lda model
    #ss.W2Vmodel()

    # score every test sentence; the second argument is presumably a row
    # index forwarded to similarity() — confirm against its signature
    for j in range(0, len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j], j)
'''    # 测试集1
    right_count = 0
    file = open("result6.txt", "a")
    for j in range(0,len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j])
        file.write(str(sentence.origin_sentence)+str(sentence.score)+"\n")
    file.flush()
예제 #20
0
if __name__ == '__main__':
    # NOTE(review): clock() (time.clock) was removed in Python 3.8 — this
    # script presumably targets an older interpreter; confirm.
    start = clock()

    # read the second half of the corpus (training side)
    file_obj = FileObj(r"sentence2.txt")
    train_sentences = file_obj.read_lines()

    # read the first half of the corpus (query side)
    file_obj = FileObj(r"sentence1.txt")
    test1_sentences = file_obj.read_lines()
    # jieba-based segmenter wrapped to remove stop words
    seg = Seg()

    # build the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()         # tfidf model
    ss.LsiModel()  # lsi model
    #ss.LdaModel()         # lda model

    # compute sentence similarities (disabled draft below worked in
    # 100-line groups and kept the 5 best matches per group)
    # for i in range(0,len(train_sentences)/100):
    # mysims = ss.mysimilarity(test1_sentences[i*100])
    # # 每一百行为一个整体
    # sims_divided = mysims[i*100:(i+1)*100]
    # # 对一百行内的相似度进行排序
    # sort_sims = sorted(enumerate(sims_divided),key = lambda item : -item[1])
    # # 选择前五个最高的相似度进行输出
    # chosen_sims = sort_sims[:5]
    # for j in range(0,5):
예제 #21
0
        # (Fragment — the enclosing `with open(...) as rf:` for the training
        # file starts before this excerpt.)
        train_sentences = rf.readlines()

    # read the test set
    with open('dataset/test_input.txt','r',encoding='utf-8') as rf:
        raw_test_sentences = rf.readlines()
    # strip trailing newlines/whitespace from each test sentence
    test_sentences = []
    for sen in raw_test_sentences:
        test_sentences.append(sen.strip())

    for sen in test_sentences:
        print(sen)
    # jieba-based segmenter; removes stop words
    seg = Seg()

    # train a new model or restore a saved one, depending on the
    # `train` flag defined before this excerpt
    ss = SentenceSimilarity(seg)
    if train:
        ss.set_sentences(train_sentences)
        ss.TfidfModel()  # tfidf model
        ss.save_model()
    else:
        ss.restore_model()

    # test set
    right_count = 0
    print(os.getcwd())
    file_result = open('dataset/test_output.txt', 'w',encoding='utf-8')

    # answers aligned with the training questions by line number
    with open("dataset/answer.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()