def getTrainTextList():
    """Build the training set as a list of ``(label, segmented_text)`` tuples.

    Reads ``../data/train.txt`` as UTF-8, fetches each record through
    ``pp.getcontent``, segments the text part with ``jieba.cut`` and joins
    the resulting tokens with single spaces.  Records for which
    ``pp.getcontent`` returns an empty string are printed and skipped.

    Returns:
        list: ``(label, text)`` tuples, where ``label`` is the second
        tab-separated field of the record and ``text`` is the
        space-joined segmentation of the remainder.
    """
    trainFileName = '../data/train.txt'
    # NOTE(review): a commented-out pp.splitfile(rawtrain, train, validate)
    # call used to sit here; presumably train.txt is generated by it from
    # ../data/rawtrain.txt -- confirm before relying on that.

    # Load all lines up front (pp.getcontent needs the whole list); the
    # with-block guarantees the file is closed even if reading fails.
    with codecs.open(trainFileName, 'r', 'utf-8') as filein:
        in_reader = filein.readlines()

    trainlist = []
    for index, line in enumerate(in_reader):
        content = pp.getcontent(in_reader, index)

        # Progress marker every 5000 processed lines.
        processed = index + 1
        if processed % 5000 == 0:
            print(("%d " % processed) + '#' * 30)

        if content == '':
            # No usable content for this line: show it and move on.
            print(line)
            continue

        fields = content.split('\t')
        id_length = len(fields[0])
        # Skip the first field plus three more characters -- presumably the
        # tab, a one-character label and a second tab; TODO confirm format.
        tokens = jieba.cut(content[id_length + 3:])
        trainlist.append((fields[1], ' '.join(tokens)))
    return trainlist
def getTestTextList(cutModel=False):
    """Return the segmented text of every usable record in the test set.

    Reads ``../data/test.txt`` as UTF-8, fetches each record through
    ``pp.getcontent``, takes everything after the first tab, strips it,
    segments it with ``jieba.cut`` and joins the tokens with single spaces.
    Records for which ``pp.getcontent`` returns an empty string are printed
    (prefixed with ``test.py#test.py#test.py#``) and skipped.

    Args:
        cutModel: Forwarded to ``jieba.cut`` as its ``cut_all`` argument.

    Returns:
        list: One space-joined token string per usable record, in file order.
    """
    testFileName = '../data/test.txt'

    # Load all lines up front (pp.getcontent needs the whole list); the
    # with-block guarantees the file is closed even if reading fails.
    with codecs.open(testFileName, 'r', 'utf-8') as filetest:
        test_reader = filetest.readlines()

    testTextList = []
    for index, line in enumerate(test_reader):
        content = pp.getcontent(test_reader, index)

        # Progress marker every 5000 processed lines.
        processed = index + 1
        if processed % 5000 == 0:
            print(("%d " % processed) + '#' * 30)

        if content == '':
            print("test.py#" * 3 + line)
            continue

        # Everything after the first tab is the text to segment; this is
        # the same span as slicing past len(first_field) + 1.
        text = content.partition('\t')[2].strip()
        tokens = jieba.cut(text, cut_all=cutModel)
        testTextList.append(' '.join(tokens))
    return testTextList
# test ##################################
# Load the trained 'version1.0' model (a Grocery('sample') model was used
# here previously) and write one "<first field>,<prediction>" line per
# usable test record.
grocery = Grocery('version1.0')
grocery.load()
print('start test')

# Read all test lines up front (pp.getcontent needs the whole list); the
# with-block closes the input file even if reading raises.
with codecs.open(testFileName, 'r', 'utf-8') as filetest:
    test_reader = filetest.readlines()

# NOTE(review): fileOutput is intentionally left open, as in the original
# section -- confirm it is closed (and thereby flushed) further down.
fileOutput = codecs.open(outputFileName, 'w', 'utf-8')

for index, line in enumerate(test_reader):
    content = pp.getcontent(test_reader, index)

    # Progress marker every 5000 processed lines.
    processed = index + 1
    if processed % 5000 == 0:
        print(("%d " % processed) + '#' * 30)

    if content == '':
        print("test.py#" * 3 + line)
        continue

    # Split at the first tab without rebinding the builtins `str`/`len`:
    # the part before it is echoed to the output, the part after it is
    # what gets classified.
    record_id, _, text = content.partition('\t')
    result = grocery.predict(text)
    fileOutput.write(record_id + ',' + result + '\n')
print 'start test' TP = 0.0 TN = 0.0 FP = 0.0 FN = 0.0 fileValidate = codecs.open(validateFileName, 'r', 'utf-8') validate_reader = fileValidate.readlines() fileOutput = codecs.open(outputFileName, 'w', 'utf-8') resultlist = [] i = 0 for line in validate_reader: content = pp.getcontent(validate_reader, i) i = i + 1 if (i % 5000 == 0): print("%d " % (i)) + '#' * 30 #if(i>10): #break if (content == ''): print line else: str = content.split('\t') len = str[0].__len__() result = grocery.predict(content[len + 3:]) if (result == str[1]): if (str[1] == u'0'): TN = TN + 1 else:
print 'start test' TP=0.0 TN=0.0 FP=0.0 FN=0.0 fileValidate=codecs.open(validateFileName,'r','utf-8') validate_reader=fileValidate.readlines() fileOutput=codecs.open(outputFileName,'w','utf-8') resultlist=[] i=0 for line in validate_reader: content=pp.getcontent(validate_reader,i) i=i+1 if(i%5000==0): print ("%d "%(i))+'#'*30 #if(i>10): #break if(content==''): print line else: str=content.split('\t') len=str[0].__len__() result=grocery.predict(content[len+3:]) if(result==str[1]): if(str[1]==u'0'): TN=TN+1 else:
# test ##################################
# Load the trained 'version1.0' model (a Grocery('sample') model was used
# here previously) and write one "<first field>,<prediction>" line per
# usable test record.
grocery = Grocery('version1.0')
grocery.load()
print('start test')

# Read all test lines up front (pp.getcontent needs the whole list); the
# with-block closes the input file even if reading raises.
with codecs.open(testFileName, 'r', 'utf-8') as filetest:
    test_reader = filetest.readlines()

# NOTE(review): fileOutput is intentionally left open, as in the original
# section -- confirm it is closed (and thereby flushed) further down.
fileOutput = codecs.open(outputFileName, 'w', 'utf-8')

for index, line in enumerate(test_reader):
    content = pp.getcontent(test_reader, index)

    # Progress marker every 5000 processed lines.
    processed = index + 1
    if processed % 5000 == 0:
        print(("%d " % processed) + '#' * 30)

    if content == '':
        print("test.py#" * 3 + line)
        continue

    # Split at the first tab without rebinding the builtins `str`/`len`:
    # the part before it is echoed to the output, the part after it is
    # what gets classified.
    record_id, _, text = content.partition('\t')
    result = grocery.predict(text)
    fileOutput.write(record_id + ',' + result + '\n')