def tf():
    dt = {}
    # if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"train_data.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(r"test_data.txt")
    test1_sentences = file_obj.read_lines()
    # Tokenizer: based on jieba, with a wrapper I added, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()    # tfidf model
    # ss.LsiModel()    # lsi model
    # ss.LdaModel()    # lda model
    # Test set 1
    right_count = 0
    # w = open("result510tf.txt", 'w')
    # w.write(str("source_id") + '\t' + str("target_id") + '\n')
    for i in range(len(test1_sentences)):
        print "*********************"
        print i
        print test1_sentences[i]
        test = str(test1_sentences[i].encode("utf-8"))
        t = test.split(',')[0]
        # similarity() yields (sentence index - 1, computed distance) pairs
        sims = ss.similarity(test1_sentences[i])
        for k, v in sims:
            print t, k + 1, v  # e.g. 2784 2784 1.0
            ind2 = k + 1
            if str(k + 1) == str(t):
                print "same"
            else:
                # w.write(str(t) + '\t' + str(k + 1) + '\n')
                addtodict2(dt, int(t), int(ind2), v)
    # w.close()
    return dt
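# addtodict2() is called above but not defined in this file. A minimal sketch of
# the helper it is assumed to be (the name comes from the call above, the body
# is an assumption): a two-level dict so that dt[source_id][target_id] = score.
def addtodict2(the_dict, key_a, key_b, value):
    # Create the inner dict on first use, then store the score under it.
    if key_a in the_dict:
        the_dict[key_a][key_b] = value
    else:
        the_dict[key_a] = {key_b: value}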
def run_prediction(input_file_path, output_file_path):
    # Read the training set
    file_obj = FileObj(r"./TFIDF_baseline/dataSet/trainQuestions.txt")
    train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(input_file_path)
    test_sentences = file_obj.read_lines()
    # Tokenizer: based on jieba, with stop-word removal
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # tfidf model
    # Test set
    right_count = 0
    file_result = open(output_file_path, 'w')
    with open("./TFIDF_baseline/dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
        answer_lines = file_answer.readlines()
        for i in range(0, len(test_sentences)):
            top_15 = ss.similarity(test_sentences[i])
            '''
            for j in range(0, len(top_15)):
                answer_index = top_15[j][0]
                answer = answer_lines[answer_index]
                file_result.write(str(top_15[j][1]) + '\t' + str(answer))
            file_result.write("\n")
            '''
            # Write only the answer of the most similar training question
            file_result.write(answer_lines[top_15[0][0]] + '\n')
    file_result.close()
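# A minimal usage sketch for run_prediction() above; the two paths are
# placeholders for illustration, not paths taken from the original repo.
if __name__ == '__main__':
    run_prediction("./TFIDF_baseline/dataSet/devQuestions.txt",
                   "./TFIDF_baseline/dataSet/result.txt")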
# encoding=utf-8
from Segment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
from time import clock

if __name__ == '__main__':
    start = clock()
    # Read the second half of the corpus
    file_obj = FileObj(r"sentence2.txt")
    train_sentences = file_obj.read_lines()
    # Read the first half of the corpus
    file_obj = FileObj(r"sentence1.txt")
    test1_sentences = file_obj.read_lines()
    # Tokenizer: based on jieba, with a wrapper added mainly to remove stop words
    seg = Seg()
    # Build the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    # ss.TfidfModel()  # tfidf model
    ss.LsiModel()      # lsi model
    # ss.LdaModel()    # lda model
    # Compute sentence similarities
    # for i in range(0, len(train_sentences) / 100):
    #     mysims = ss.mysimilarity(test1_sentences[i * 100])
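# FileObj (imported from fileObject above) is the repo's small file reader; its
# implementation is not shown in this section. A rough sketch of the interface
# the scripts rely on -- the method name comes from the calls above, while the
# body (plain line-by-line read, blank lines dropped) is an assumption:
class FileObjSketch(object):
    def __init__(self, file_path):
        self.file_path = file_path

    def read_lines(self):
        # Return the file as a list of stripped, non-empty lines.
        with open(self.file_path, 'r') as f:
            return [line.strip() for line in f if line.strip()]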
            if it[0] not in uselessTag:
                if not useStopWord:
                    word_list.append(tagdict['word'][index])
                elif tagdict['word'][index] not in self.stopwords:
                    word_list.append(tagdict['word'][index])
        return word_list

    def cut(self, sentences):
        """
        Tokenize the corpus.
        :param sentences: the corpus to tokenize
        :return: list of denoised word lists
        """
        tags = self.get_tags(sentences)
        cutedSentences = []
        for sentence in tags:
            cutedSentences.append(self.denoisingOne(sentence))
        return cutedSentences

    def depenPars(self, sentences):
        return self.nlp.depparser(sentences)


if __name__ == "__main__":
    import json
    from fileObject import FileObj
    Fobj = FileObj(r"testSet/trainSet.txt")
    scentences1 = Fobj.read_lines()
    cutTool = CNSegment()
    lst = cutTool.depenPars(scentences1[0])
    print json.dumps(lst, encoding="UTF-8", ensure_ascii=False)
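    # Hedged usage sketch (not in the original file): based on its docstring,
    # cut() is assumed to return one denoised word list per input sentence.
    cutedAll = cutTool.cut(scentences1)
    print json.dumps(cutedAll[0], encoding="UTF-8", ensure_ascii=False)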
#encoding=utf-8
from cutWords import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
from tqdm import tqdm

if __name__ == '__main__':
    # Read the training set
    # file_obj = FileObj(r"dataSet/trainQuestions.txt")
    # train_sentences = file_obj.read_lines()
    # Read the test set
    file_obj = FileObj(r"dataSet/devQuestions.txt")
    test_sentences = file_obj.read_lines()
    # Tokenizer: based on jieba, with stop-word removal
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(test_sentences)
    ss.TfidfModel()  # tfidf model
    # Test set
    right_count = 0
    file_result = open('dataSet/result.txt', 'w')
    with open("dataSet/trainAnswers.txt", 'r', encoding='utf-8') as file_answer:
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence

if __name__ == '__main__':
    # Read the training set
    file_obj = FileObj(r"testSet/trainSet.txt")
    train_sentences = file_obj.read_lines()
    # Read test set 1
    file_obj = FileObj(r"testSet/testSet1.txt")
    test1_sentences = file_obj.read_lines()
    # Read test set 2
    file_obj = FileObj(r"testSet/testSet2.txt")
    test2_sentences = file_obj.read_lines()
    # Tokenizer: based on jieba, with a wrapper I added, mainly to remove stop words
    seg = Seg()
    # Train the model
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()    # tfidf model
    # ss.LsiModel()    # lsi model
    # ss.LdaModel()    # lda model
    # Test set 1
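# SentenceSimilarity is used by every script above but lives elsewhere in the
# repo (sentenceSimilarity.py). A hedged sketch of the interface these scripts
# rely on, built on gensim; only the TF-IDF path is shown, seg.cut() is an
# assumed method of the jieba wrapper, and the body is an illustration rather
# than the repo's actual implementation.
from gensim import corpora, models, similarities

class SentenceSimilaritySketch(object):
    def __init__(self, seg):
        self.seg = seg

    def set_sentences(self, sentences):
        # One token list per training sentence, via the jieba wrapper.
        self.texts = [self.seg.cut(s) for s in sentences]
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(t) for t in self.texts]

    def TfidfModel(self):
        self.model = models.TfidfModel(self.corpus)
        self.index = similarities.MatrixSimilarity(self.model[self.corpus])

    def similarity(self, sentence, top_n=15):
        # Return the top_n (sentence index, score) pairs, best first,
        # matching how the callers above unpack the result.
        vec = self.model[self.dictionary.doc2bow(self.seg.cut(sentence))]
        sims = self.index[vec]
        return sorted(enumerate(sims), key=lambda x: float(x[1]), reverse=True)[:top_n]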