Example #1
class Chatbot_port2(object):
    def __init__(self):
        # Tokenizer based on jieba word segmentation, with stopwords removed
        seg = Seg()
        self.ss = SentenceSimilarity(seg)
        self.ss.restore_model()
        with open("dataset/answer.txt", 'r', encoding='utf-8') as file_answer:
            self.line = file_answer.readlines()

    def chat(self, question):
        question = question.strip()
        top_10 = self.ss.similarity(question)

        answer_index = top_10[0][0]
        answer = self.line[answer_index]
        return answer, top_10[0][1]
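
The `Seg` and `SentenceSimilarity` classes themselves are not shown in any of these examples. Below is a minimal sketch of the interface they appear to assume, built on jieba and gensim; the class and method names follow the usage above, but the internals are an illustrative guess, not the original implementation, and persistence helpers such as restore_model() are omitted:

import jieba
from gensim import corpora, models, similarities

class Seg(object):
    """Thin wrapper around jieba segmentation (stopword removal omitted)."""
    def cut(self, sentence):
        return list(jieba.cut(sentence))

class SentenceSimilarity(object):
    def __init__(self, seg):
        self.seg = seg

    def set_sentences(self, sentences):
        # Tokenize the training sentences; build the dictionary and bag-of-words corpus.
        self.sentences = sentences
        texts = [self.seg.cut(s) for s in sentences]
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(t) for t in texts]

    def TfidfModel(self):
        # Train a TF-IDF model and build a similarity index over the corpus.
        self.model = models.TfidfModel(self.corpus)
        self.index = similarities.MatrixSimilarity(
            self.model[self.corpus], num_features=len(self.dictionary))

    def similarity(self, sentence, top_n=10):
        # Return the top-n (train_index, score) pairs, best match first.
        vec = self.model[self.dictionary.doc2bow(self.seg.cut(sentence))]
        ranked = sorted(enumerate(self.index[vec]), key=lambda kv: kv[1], reverse=True)
        return ranked[:top_n]

Under this sketch, similarity() returns (index, score) pairs sorted best-first, which matches how Example #1 reads top_10[0][0] and top_10[0][1]; other examples below treat the result as an object instead (see the note after Example #2).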
Example #2
def dictTest():
    label_map = {}  # maps original sentence -> label (renamed from `dict` to avoid shadowing the builtin)
    seg = Seg()
    original_ss = SentenceSimilarity(seg)
    readDictData(original_ss, label_map)
    original_ss.TfidfModel()
    #     original_ss.LdaModel()
    #     original_ss.LsiModel()
    total_data_len = len(X_test)
    success_len = 0
    f1 = open('ah_data_lsi.txt', 'w', encoding='utf-8')
    for i in range(len(X_test)):
        print("-------------------------------------")
        text = checkData(X_test[i])
        text = "".join(seg.cut_for_search(text))
        print("测试内容: " + text)

        try:
            sentences = original_ss.similarityArray(text)
            sentences = sorted(sentences,
                               key=lambda e: e.get_score(),
                               reverse=True)
            count = 0
            for sentence in sentences:
                if sentence.get_score() > 0.9:
                    print(sentence.get_score())

                if sentence.get_score() == 1.0:
                    count = count + 1

            sentence = original_ss.similarity(text)
            if count < 2 and label_map.get(
                    sentence.get_origin_sentence()) == Y_test[i]:
                success_len = success_len + 1
            else:
                y = Y_test[i]
                f1.writelines("-------------------------------------\n")
                f1.writelines("测试内容: " + text + "\n")
                for sentence in sentences:
                    f1.writelines("匹配标签: 【" +
                                  dict.get(sentence.get_origin_sentence()) +
                                  "】 真实标签:【" + y + "】 评分: " +
                                  str(sentence.get_score()) + "\n")
        except Exception as e:
            print(e)
    f1.close()
    print(success_len / total_data_len)
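
Note that Examples #2, #5, #7, and #8 treat the results of similarity()/similarityArray() as objects with accessors rather than plain tuples. A hypothetical minimal result class matching the accessors and attributes used in these snippets (get_score, get_origin_sentence, .id, .score, .origin_sentence):

class Sentence(object):
    """Hypothetical result object; names inferred from the call sites above."""
    def __init__(self, origin_sentence, score, id=None):
        self.origin_sentence = origin_sentence  # matched training sentence
        self.score = score                      # similarity score
        self.id = id                            # index in the training set

    def get_origin_sentence(self):
        return self.origin_sentence

    def get_score(self):
        return self.score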
Example #3
def tf():
    dt = {}
    # Read in the training set
    file_obj = FileObj(r"train_data.txt")
    train_sentences = file_obj.read_lines()

    # Read in the test set
    file_obj = FileObj(r"test_data.txt")
    test1_sentences = file_obj.read_lines()

    # Tokenizer based on jieba; I added a wrapper of my own, mainly to remove stopwords
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # Test set 1
    right_count = 0
    # w=open("result510tf.txt",'w')
    # w.write(str("source_id") + '\t' + str("target_id") + '\n')
    for i in range(len(test1_sentences)):
        print("*********************")
        print(i)
        print(test1_sentences[i])
        t = test1_sentences[i].split(',')[0]
        sims = ss.similarity(test1_sentences[i])
        # sims holds (sentence index - 1, computed distance) pairs
        for k, v in sims:
            print(t, k + 1, v)  # e.g. 2784 2784 1.0
            ind2 = k + 1
            if str(k + 1) == str(t):
                print("same")
            else:
                # w.write(str(t) + '\t' + str(k+1) + '\n')
                addtodict2(dt, int(t), int(ind2), v)
    # w.close()
    return dt
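
addtodict2() is called in tf() but never defined in these snippets. Judging from the call site addtodict2(dt, int(t), int(ind2), v), it most likely stores the score under a two-level key; a plausible reconstruction (inferred, not confirmed):

def addtodict2(d, key_a, key_b, value):
    # Store value under a two-level key: d[key_a][key_b] = value.
    d.setdefault(key_a, {})[key_b] = value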
Example #4
def run_prediction(input_file_path, output_file_path):
    # Read in the training set
    file_obj = FileObj(r"./TFIDF_baseline/dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()
   

    # Read in the test set
    file_obj = FileObj(input_file_path)
    test_sentences = file_obj.read_lines()


    # Tokenizer based on jieba word segmentation, with stopwords removed
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # tfidf model

    # Test set
    right_count = 0
    
    file_result = open(output_file_path, 'w')
    with open("./TFIDF_baseline/dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()
           
    for i in range(0, len(test_sentences)):
        top_15 = ss.similarity(test_sentences[i])
        
        '''
        for j in range(0,len(top_15)):
            answer_index=top_15[j][0]
            answer=line[answer_index]
            file_result.write(str(top_15[j][1])+'\t'+str(answer))
        file_result.write("\n")
        '''
        file_result.write(line[top_15[0][0]] + '\n')
        
    file_result.close()
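
Assuming the directory layout hard-coded above, run_prediction() would be driven like this; the input and output paths here are placeholders, not files confirmed by the examples:

if __name__ == '__main__':
    # Hypothetical invocation; point these at your own question files.
    run_prediction("./TFIDF_baseline/dataSet/devQuestions.txt",
                   "./TFIDF_baseline/dataSet/result.txt")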
Example #5
    test2_sentences = file_obj.read_lines()

    # Tokenizer based on jieba; I added a wrapper of my own, mainly to remove stopwords
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # Test set 1
    right_count = 0
    for i in range(0, len(train_sentences)):
        sentence = ss.similarity(test1_sentences[i])

        if i != sentence.id:
            print(str(i) + " wrong! score: " + str(sentence.score))
        else:
            right_count += 1
            print(str(i) + " right! score: " + str(sentence.score))

    print ("正确率为: " + str(float(right_count)/len(train_sentences)))

    # Test set 2
    # right_count = 0
    # for i in range(0,len(train_sentences)):
    #     sentence = ss.similarity(test2_sentences[i])
    #
    #     if i != sentence.id:
Example #6
    file_obj = FileObj(r"dataSet/devQuestions.txt")
    test_sentences = file_obj.read_lines()

    # Tokenizer based on jieba word segmentation, with stopwords removed
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # tfidf model

    # Test set
    right_count = 0

    file_result = open('dataSet/result.txt', 'w')
    with open("dataSet/trainAnswers.txt", 'r',
              encoding='utf-8') as file_answer:
        line = file_answer.readlines()

    for i in range(0, len(test_sentences)):
        top_15 = ss.similarity(test_sentences[i])

        for j in range(0, len(top_15)):
            answer_index = top_15[j][0]
            answer = line[answer_index]
            file_result.write(str(top_15[j][1]) + '\t' + str(answer))
        file_result.write("\n")

    file_result.close()
Example #7
    test2_sentences = file_obj.read_lines()

    # Tokenizer based on jieba; I added a wrapper of my own, mainly to remove stopwords
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # tfidf model
    # ss.LsiModel()         # lsi model
    # ss.LdaModel()         # lda model

    # Test set 1
    right_count = 0
    for i in range(0, len(train_sentences)):
        sentence = ss.similarity(test1_sentences[i])

        if i != sentence.id:
            print(str(i) + " wrong! score: " + str(sentence.score))
        else:
            right_count += 1
            print(str(i) + " right! score: " + str(sentence.score))

    print("Accuracy: " + str(float(right_count) / len(train_sentences)))

    # Test set 2
    # right_count = 0
    # for i in range(0,len(train_sentences)):
    #     sentence = ss.similarity(test2_sentences[i])
    #
    #     if i != sentence.id:
Example #8
    file_obj = FileObj(r"testSet/zhenduanxx-utf.txt")
    test1_sentences = file_obj.read_lines()
    # test1_sentences = "子宫 肌瘤"  # sample query: "uterine fibroids"

    # Tokenizer based on jieba word segmentation, mainly to remove stopwords
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    # ss.TfidfModel()       # tfidf model
    # ss.LsiModel()         # lsi model
    ss.LdaModel()  # lda model
    # ss.W2Vmodel()

    for j in range(0, len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j], j)
    # Test set 1 (disabled)
    # right_count = 0
    # file = open("result6.txt", "a")
    # for j in range(0, len(test1_sentences)):
    #     sentence = ss.similarity(test1_sentences[j])
    #     file.write(str(sentence.origin_sentence) + str(sentence.score) + "\n")
    # file.flush()
    # file.close()
    #
    # if i != sentence.id:
    #     print(str(i) + " wrong! score: " + str(sentence.score))
    # else:
    #     right_count += 1
    #     print(str(i) + " right! score: " + str(sentence.score))