Example #1
def tf():
    dt = {}
    # if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"train_data.txt")
    train_sentences = file_obj.read_lines()

    # Read the test set
    file_obj = FileObj(r"test_data.txt")
    test1_sentences = file_obj.read_lines()

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # TF-IDF model
    # ss.LsiModel()         # LSI model
    # ss.LdaModel()         # LDA model

    # Test set 1
    right_count = 0
    # w = open("result510tf.txt", 'w')
    # w.write(str("source_id") + '\t' + str("target_id") + '\n')
    for i in range(len(test1_sentences)):
        print("*********************")
        print(i)
        print(test1_sentences[i])
        # Each test line starts with its source id, e.g. "2784,..."
        t = test1_sentences[i].split(',')[0]
        sims = ss.similarity(test1_sentences[i])
        # Each item is (sentence index - 1, computed similarity score)
        for k, v in sims:
            print(t, k + 1, v)  # e.g. 2784 2784 1.0
            ind2 = k + 1
            if str(k + 1) == str(t):
                print("same")
            else:
                # w.write(str(t) + '\t' + str(k + 1) + '\n')
                addtodict2(dt, int(t), int(ind2), v)
    # w.close()
    return dt
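
addtodict2 is a helper that is not shown in the snippet; a minimal sketch, assuming it stores the score in a two-level dict keyed by source id and then target id:

def addtodict2(d, key_a, key_b, val):
    # Store val under d[key_a][key_b], creating the inner dict on first use.
    d.setdefault(key_a, {})[key_b] = val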
Example #2
def run_prediction(input_file_path, output_file_path):
    # Read the training set
    file_obj = FileObj(r"./TFIDF_baseline/dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()

    # Read the test set
    file_obj = FileObj(input_file_path)
    test_sentences = file_obj.read_lines()

    # Segmentation tool: jieba-based, with stop-word removal
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # TF-IDF model

    # Test set
    right_count = 0

    file_result = open(output_file_path, 'w')
    # Answers are aligned line-by-line with the training questions
    with open("./TFIDF_baseline/dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        line = file_answer.readlines()

    for i in range(0, len(test_sentences)):
        top_15 = ss.similarity(test_sentences[i])

        '''
        for j in range(0, len(top_15)):
            answer_index = top_15[j][0]
            answer = line[answer_index]
            file_result.write(str(top_15[j][1]) + '\t' + str(answer))
        file_result.write("\n")
        '''
        # Write the answer of the most similar training question
        file_result.write(line[top_15[0][0]] + '\n')

    file_result.close()
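
SentenceSimilarity itself is defined elsewhere in these projects; a minimal sketch of its TF-IDF path with gensim, assuming a Seg object that exposes cut(sentence) returning a token list and that similarity returns (index, score) pairs as the examples above expect:

from gensim import corpora, models, similarities

class SentenceSimilarityTfidfSketch:
    def __init__(self, seg):
        self.seg = seg

    def set_sentences(self, sentences):
        # Tokenize every training sentence with the jieba-based wrapper.
        self.texts = [self.seg.cut(s) for s in sentences]

    def TfidfModel(self):
        # Build the dictionary, the TF-IDF model and a similarity index.
        self.dictionary = corpora.Dictionary(self.texts)
        corpus = [self.dictionary.doc2bow(t) for t in self.texts]
        self.tfidf = models.TfidfModel(corpus)
        self.index = similarities.SparseMatrixSimilarity(
            self.tfidf[corpus], num_features=len(self.dictionary))

    def similarity(self, sentence, top_n=15):
        # Return the top_n most similar training sentences as (index, score) pairs.
        vec = self.tfidf[self.dictionary.doc2bow(self.seg.cut(sentence))]
        sims = self.index[vec]
        ranked = sorted(enumerate(sims), key=lambda kv: kv[1], reverse=True)
        return ranked[:top_n]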
Example #3
# encoding=utf-8

from Segment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
from time import perf_counter

if __name__ == '__main__':
    start = perf_counter()

    # Read the second half of the corpus
    file_obj = FileObj(r"sentence2.txt")
    train_sentences = file_obj.read_lines()

    # Read the first half of the corpus
    file_obj = FileObj(r"sentence1.txt")
    test1_sentences = file_obj.read_lines()

    # Segmentation tool: a jieba-based wrapper that mainly removes stop words
    seg = Seg()

    # Build the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()         # TF-IDF model
    ss.LsiModel()  # LSI model
    #ss.LdaModel()         # LDA model

    # Compute sentence similarity
    # for i in range(0, len(train_sentences) // 100):
    #     mysims = ss.mysimilarity(test1_sentences[i * 100])
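
The FileObj and Seg helpers are imported in every example but never shown; a minimal sketch, assuming FileObj returns stripped UTF-8 lines and Seg wraps jieba with an optional stop-word list (the stop-word path is a placeholder):

import jieba

class FileObj:
    def __init__(self, path):
        self.path = path

    def read_lines(self):
        # Return the file as a list of stripped, non-empty UTF-8 lines.
        with open(self.path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]

class Seg:
    def __init__(self, stopword_path='stopwords.txt'):
        # Hypothetical stop-word list; adjust the path to your project.
        try:
            with open(stopword_path, 'r', encoding='utf-8') as f:
                self.stopwords = set(w.strip() for w in f)
        except FileNotFoundError:
            self.stopwords = set()

    def cut(self, sentence):
        # jieba segmentation with stop words removed.
        return [w for w in jieba.cut(sentence) if w.strip() and w not in self.stopwords]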
Example #4
#encoding=utf-8

from fileObject import FileObj, Seg
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()

    # Read test set 1
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()

    # Read test set 2
    file_obj = FileObj(r"D:/Github Project/sentence Similarity/testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # TF-IDF model
    # ss.LsiModel()         # LSI model
    # ss.LdaModel()         # LDA model

    # Test set 1
    right_count = 0
Example #5
#encoding=utf-8

from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()

    # Read test set 1
    file_obj = FileObj(r"testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()

    # Read test set 2
    file_obj = FileObj(r"testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # TF-IDF model
    # ss.LsiModel()         # LSI model
    # ss.LdaModel()         # LDA model

    # Test set 1
Example #6
#encoding=utf-8

from cutWords import Seg
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
import time
from time import ctime
import threading
file_obj = FileObj(r"dataSet/train_q.txt")
train_sentences = file_obj.read_lines()
with open("dataSet/train_a.txt", 'r', encoding='utf-8') as file_answer:
    line = file_answer.readlines()

seg = Seg()

# Train the models
ss1 = SentenceSimilarity(seg)
ss1.set_sentences(train_sentences)
ss1.TfidfModel()  # TF-IDF model

ss2 = SentenceSimilarity(seg)
ss2.set_sentences(train_sentences)
ss2.LsiModel()  # LSI model


def tfidf_model(sentence):
    top = ss1.similarity(sentence)
    answer_index = top[0][0]
    answer = line[answer_index]
    return top[0][1], answer
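
The snippet stops after the TF-IDF lookup; a sketch of an LSI counterpart that mirrors tfidf_model, assuming ss2.similarity returns (index, score) pairs just like ss1.similarity:

def lsi_model(sentence):
    # Same lookup as tfidf_model, but ranked by the LSI index (ss2).
    top = ss2.similarity(sentence)
    answer_index = top[0][0]
    answer = line[answer_index]
    return top[0][1], answer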
Example #7
            if it[0] not in uselessTag:
                if not useStopWord:
                    word_list.append(tagdict['word'][index])
                elif tagdict['word'][index] not in self.stopwords:
                    word_list.append(tagdict['word'][index])
        return word_list

    def cut(self, sentences):
        """
        Segment the corpus.
        :param sentences: corpus to segment
        :return: list of denoised token lists
        """
        tags=self.get_tags(sentences)
        cutedSentences=[]
        for sentence in tags:
            cutedSentences.append(self.denoisingOne(sentence))
        return cutedSentences

    def depenPars(self,sentences):
        return self.nlp.depparser(sentences)


if __name__=="__main__":
    from fileObject import FileObj
    Fobj=FileObj(r"testSet/trainSet.txt")

    scentences1 = Fobj.read_lines()
    cutTool=CNSegment()
    lst=cutTool.depenPars(scentences1[0])
    print json.dumps(lst,encoding="UTF-8", ensure_ascii=False)
Example #8
from collections import deque
import random
import string
from zhon.hanzi import punctuation
import math

NO_PUNCTUATION = False  # strip punctuation from the start and end of each sentence
RANDRANGE = 1
MORE_HAPPY = True
BAD_EMO = ['死', '亡']
WINDOW_SIZE = 2


if __name__ == '__main__':
    # Read the training set
    # (FileObj and Seg come from the project's helper modules, omitted from this snippet)
    file_obj = FileObj(r"dataSet/train.txt")
    train_sentences = file_obj.read_lines()
    train_sentences_len = len(train_sentences)

    # Read the test set
    file_obj = FileObj(r"dataSet/test_keywords.txt")
    test_sentences = file_obj.read_lines()

    # Segmentation tool: jieba-based, with stop-word removal
    seg = Seg()

#     # Train the model
#     ss = SentenceSimilarity(seg)
#     ss.set_sentences(train_sentences)
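
The NO_PUNCTUATION flag points at a punctuation-stripping step; a minimal sketch using string.punctuation together with zhon.hanzi.punctuation (the function name is hypothetical):

import string
from zhon.hanzi import punctuation

def strip_edge_punctuation(sentence):
    # Remove ASCII and Chinese punctuation from both ends of a sentence.
    return sentence.strip(string.punctuation + punctuation)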
Example #9
#encoding=utf-8

from cutWords import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()

    # Read the test set
    file_obj = FileObj(r"dataSet/devQuestions.txt")
    test_sentences = file_obj.read_lines()

    # Segmentation tool: jieba-based, with stop-word removal
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # TF-IDF model

    # Test set
    right_count = 0

    file_result = open('dataSet/result.txt', 'w')
    with open("dataSet/trainAnswers.txt", 'r',
              encoding='utf-8') as file_answer:
        line = file_answer.readlines()
Example #10
#encoding=utf-8

from Segment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
from time import perf_counter

if __name__ == '__main__':
    start = perf_counter()

    for num in range(1, 81):

        # Read the second half of the corpus
        file_obj = FileObj("sentence2_" + str(num) + ".txt")
        train_sentences = file_obj.read_lines()

        # Read the first half of the corpus
        file_obj = FileObj("sentence1_" + str(num) + ".txt")
        test1_sentences = file_obj.read_lines()

        # Segmentation tool: a jieba-based wrapper that mainly removes stop words
        seg = Seg()

        # Build the model
        ss = SentenceSimilarity(seg)
        ss.set_sentences(train_sentences)
        #ss.TfidfModel()         # TF-IDF model
        ss.LsiModel()  # LSI model
        #ss.LdaModel()         # LDA model
Example #11
#encoding=utf-8

from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/result0.txt")
    train_sentences = file_obj.read_lines()

    # Read test set 1
    file_obj = FileObj(r"testSet/test0.txt")
    test1_sentences = file_obj.read_lines()

    # Read test set 2
    #file_obj = FileObj(r"testSet/testSet2.txt")
    #test2_sentences = file_obj.read_lines()

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # TF-IDF model
    #ss.LsiModel()         # LSI model
    #ss.LdaModel()         # LDA model

    # Test set 1
Example #12
#encoding=utf-8

from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()

    # Read test set 1
    file_obj = FileObj(r"testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()

    # Read test set 2
    file_obj = FileObj(r"testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # TF-IDF model
    # ss.LsiModel()         # LSI model
    # ss.LdaModel()         # LDA model

    # Test set 1
Example #13
#encoding=utf-8

from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/data2")
    train_sentences = file_obj.read_lines()

    # Read test set 1
    file_obj = FileObj(r"testSet/testSet3")
    test1_sentences = file_obj.read_lines()

    # Read test set 2
    #file_obj = FileObj(r"testSet/testSet2.txt")
    #test2_sentences = file_obj.read_lines()

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()         # TF-IDF model
    #ss.LsiModel()         # LSI model
    #ss.LdaModel()         # LDA model
    ss.FasttxModel()
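
FasttxModel is not shown anywhere in these examples; a minimal sketch of what a fastText-based ranking could look like, assuming gensim's FastText and mean-pooled sentence vectors (both function names are hypothetical):

import numpy as np
from gensim.models import FastText

def build_fasttext_index(token_lists, vector_size=100):
    # Train fastText on the tokenized training sentences and return
    # (model, matrix of mean sentence vectors).
    model = FastText(sentences=token_lists, vector_size=vector_size, window=5, min_count=1)
    vectors = np.array([
        np.mean([model.wv[w] for w in toks], axis=0) if toks else np.zeros(vector_size)
        for toks in token_lists
    ])
    return model, vectors

def fasttext_similarity(model, vectors, tokens, top_n=15):
    # Rank training sentences by cosine similarity to the query tokens.
    query = np.mean([model.wv[w] for w in tokens], axis=0)
    sims = vectors @ query / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query) + 1e-12)
    ranked = sorted(enumerate(sims), key=lambda kv: kv[1], reverse=True)
    return ranked[:top_n]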
Example #14
#encoding=utf-8

from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/tjmsnew.txt")
    train_sentences = file_obj.read_lines()

    file_obj = FileObj(r"testSet/zhenduanxx-utf.txt")
    test1_sentences = file_obj.read_lines()
    #test1_sentences = "子宫 肌瘤"

    # Segmentation tool: jieba-based, mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    #ss.TfidfModel()         # TF-IDF model
    #ss.LsiModel()         # LSI model
    ss.LdaModel()  # LDA model
    #ss.W2Vmodel()

    for j in range(0, len(test1_sentences)):
        sentence = ss.similarity(test1_sentences[j], j)
'''    # Test set 1
    right_count = 0
Example #15
# import modules & set up logging
import gensim, logging
from fileObject import FileObj
from gensim.models import Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':

    file_obj = FileObj(r"testSet/data")
    sentences = file_obj.read_lines_1_words()
    # Training call from the original (gensim >= 4 renames size to vector_size):
    # model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
    # model.save('w2v_model')
    model = Word2Vec.load('w2v_model')
    # Query through model.wv, which works on both old and new gensim versions.
    print(model.wv.most_similar(['怀孕']))
    print(model.wv.similarity('怀孕', '孕妇'))
Example #16
File: xzy.py, Project: fyl0109/DataScience
#encoding=utf-8

from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/num-tjms.txt")
    train_sentences = file_obj.read_lines()

    test1_sentences = input("Enter a keyword: ")
    #test1_sentences = "T波 异常"

    # Segmentation tool: a wrapper around jieba that mainly removes stop words
    seg = Seg()

    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()         # TF-IDF model
    # ss.LsiModel()         # LSI model
    # ss.LdaModel()         # LDA model

    # Test set 1
    right_count = 0
    sentence = ss.similarity(test1_sentences)
    print(sentence.origin_sentence)
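
Unlike the earlier examples, this script treats the return value of similarity as a single Sentence object; a minimal sketch of such a wrapper, assuming only the origin_sentence attribute that the script actually reads:

class Sentence:
    def __init__(self, origin_sentence, index=None, score=None):
        # Keep the raw text together with its position and similarity score.
        self.origin_sentence = origin_sentence
        self.index = index
        self.score = score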