def getTrainTextList():
    """Build the training samples from '../data/train.txt'.

    The file is read as UTF-8 and every line index is handed to
    ``pp.getcontent`` to obtain the record text.  Each non-empty record is
    split on tabs: the second field is kept as the label, and the text that
    follows is segmented with ``jieba.cut`` and re-joined with single spaces.
    Lines whose record comes back empty are printed and skipped.

    Returns:
        list: ``(label, space_joined_tokens)`` tuples in file order.

    Raises:
        IndexError: if a non-empty record contains no tab, so that no label
            field exists.
    """
    # The train file is expected to exist already; the step that derives it,
    # pp.splitfile('../data/rawtrain.txt', trainFileName,
    # '../data/validate.txt'), is disabled in this function.
    trainFileName = '../data/train.txt'

    # The with-block guarantees the handle is released even when reading
    # fails; the lines are kept in memory for the processing loop below.
    with codecs.open(trainFileName, 'r', 'utf-8') as filein:
        lines = filein.readlines()

    trainlist = []
    for index, line in enumerate(lines):
        # pp.getcontent receives the whole line list plus a 0-based index.
        content = pp.getcontent(lines, index)
        count = index + 1
        if count % 5000 == 0:
            # Progress marker every 5000 lines.
            print("%d " % count + '#' * 30)

        if content == '':
            # No usable record for this line: show it and carry on.
            print(line)
            continue

        fields = content.split('\t')
        label = fields[1]
        # Skip the first field plus three more characters.
        # NOTE(review): presumably tab + one-character label + tab - confirm
        # that labels are always a single character.
        text_start = len(fields[0]) + 3
        tokens = jieba.cut(content[text_start:])
        trainlist.append((label, ' '.join(tokens)))

    return trainlist
예제 #2
0
def getTestTextList(cutModel=False):
    """Return the segmented text of every record in '../data/test.txt'.

    The file is read as UTF-8 and every line index is handed to
    ``pp.getcontent`` to obtain the record text.  For each non-empty record,
    everything after the first tab is stripped, segmented with ``jieba.cut``
    and re-joined with single spaces.  Lines whose record comes back empty
    are printed with a "test.py#" tag and skipped.

    Args:
        cutModel: forwarded to ``jieba.cut`` as its ``cut_all`` argument.

    Returns:
        list: one space-joined token string per non-empty record, in file
        order.
    """
    testFileName = '../data/test.txt'

    # The with-block guarantees the handle is released even when reading
    # fails; the lines are kept in memory for the processing loop below.
    with codecs.open(testFileName, 'r', 'utf-8') as filetest:
        lines = filetest.readlines()

    testTextList = []
    for index, line in enumerate(lines):
        # pp.getcontent receives the whole line list plus a 0-based index.
        content = pp.getcontent(lines, index)
        count = index + 1
        if count % 5000 == 0:
            # Progress marker every 5000 lines.
            print("%d " % count + '#' * 30)

        if content == '':
            # No usable record for this line: show it, tagged, and carry on.
            print("test.py#" * 3 + line)
            continue

        # Everything after the first tab is the text to segment (an empty
        # string when the record holds no tab at all).
        text = content.partition('\t')[2]
        tokens = jieba.cut(text.strip(), cut_all=cutModel)
        testTextList.append(' '.join(tokens))

    return testTextList
# test ##################################
# Instantiate the classifier named 'version1.0' and load it
# (an earlier revision used Grocery('sample') here).
grocery = Grocery('version1.0')
grocery.load()

print('start test')

# Read every test line up front; the with-block closes the input file as soon
# as the lines are in memory.
with codecs.open(testFileName, 'r', 'utf-8') as filetest:
    test_reader = filetest.readlines()

# The output handle used to be left open (only the input file was closed), so
# it is now managed by a with-block and is flushed and closed even if
# prediction fails part-way through.
with codecs.open(outputFileName, 'w', 'utf-8') as fileOutput:
    for i, line in enumerate(test_reader, 1):
        # pp.getcontent receives the whole line list plus a 0-based index.
        content = pp.getcontent(test_reader, i - 1)
        if i % 5000 == 0:
            # Progress marker every 5000 lines.
            print("%d " % i + '#' * 30)

        if content == '':
            # No usable record for this line: show it, tagged, and carry on.
            print("test.py#" * 3 + line)
            continue

        # Text before the first tab is written out as the record id;
        # everything after it is what gets classified.  Using partition also
        # avoids rebinding the builtins `str` and `len` at module level.
        record_id, _, text = content.partition('\t')
        result = grocery.predict(text)
        fileOutput.write(record_id + ',' + result + '\n')
예제 #4
0
# Validation pass: run the loaded classifier over the validation file and
# compare each prediction with the label stored in the record.
# NOTE(review): this snippet is truncated - it stops at a dangling `else:`,
# so the remaining counter updates and any use of fileOutput / resultlist are
# not visible here.
print 'start test'
# Tallies kept as floats; presumably true/false positive/negative counts -
# only the TN update is visible in this snippet.
TP = 0.0
TN = 0.0
FP = 0.0
FN = 0.0

# Read the whole validation file (UTF-8) into memory.
fileValidate = codecs.open(validateFileName, 'r', 'utf-8')
validate_reader = fileValidate.readlines()

# Opened for writing here; nothing is written to it in the visible part.
fileOutput = codecs.open(outputFileName, 'w', 'utf-8')

resultlist = []
i = 0
for line in validate_reader:
    # pp.getcontent receives the whole line list plus a 0-based index.
    content = pp.getcontent(validate_reader, i)
    i = i + 1
    # Progress marker every 5000 lines.
    if (i % 5000 == 0):
        print("%d " % (i)) + '#' * 30
    # Disabled debug limiter (stop after 10 lines):
    #if(i>10):
    #break
    if (content == ''):
        # No usable record for this line: show it and carry on.
        print line
    else:
        # NOTE(review): `str` and `len` rebind the builtins at module level.
        str = content.split('\t')
        len = str[0].__len__()
        # Classify the text that starts three characters after the first
        # field; presumably that skips tab + one-character label + tab.
        result = grocery.predict(content[len + 3:])
        if (result == str[1]):
            # Prediction matches the stored label (second tab field).
            if (str[1] == u'0'):
                TN = TN + 1
            else:
                # NOTE(review): the snippet is cut off here; the body of this
                # branch is not visible.
# Validation pass: run the loaded classifier over the validation file and
# compare each prediction with the label stored in the record.
# NOTE(review): this snippet is truncated - it stops at a dangling `else:`,
# so the remaining counter updates and any use of fileOutput / resultlist are
# not visible here.
print 'start test'
# Tallies kept as floats; presumably true/false positive/negative counts -
# only the TN update is visible in this snippet.
TP=0.0
TN=0.0
FP=0.0
FN=0.0

# Read the whole validation file (UTF-8) into memory.
fileValidate=codecs.open(validateFileName,'r','utf-8')
validate_reader=fileValidate.readlines()

# Opened for writing here; nothing is written to it in the visible part.
fileOutput=codecs.open(outputFileName,'w','utf-8')

resultlist=[]
i=0
for line in validate_reader:
    # pp.getcontent receives the whole line list plus a 0-based index.
    content=pp.getcontent(validate_reader,i)
    i=i+1
    # Progress marker every 5000 lines.
    if(i%5000==0):
        print ("%d "%(i))+'#'*30
    # Disabled debug limiter (stop after 10 lines):
    #if(i>10):
        #break
    if(content==''):
        # No usable record for this line: show it and carry on.
        print line
    else:
        # NOTE(review): `str` and `len` rebind the builtins at module level.
        str=content.split('\t')
        len=str[0].__len__()
        # Classify the text that starts three characters after the first
        # field; presumably that skips tab + one-character label + tab.
        result=grocery.predict(content[len+3:])
        if(result==str[1]):
            # Prediction matches the stored label (second tab field).
            if(str[1]==u'0'):
                TN=TN+1
            else:
                # NOTE(review): the snippet is cut off here; the body of this
                # branch is not visible.
# test ##################################
# Instantiate the classifier named 'version1.0' and load it
# (an earlier revision used Grocery('sample') here).
grocery = Grocery('version1.0')
grocery.load()

print('start test')

# Read every test line up front; the with-block closes the input file as soon
# as the lines are in memory.
with codecs.open(testFileName, 'r', 'utf-8') as filetest:
    test_reader = filetest.readlines()

# The output handle used to be left open (only the input file was closed), so
# it is now managed by a with-block and is flushed and closed even if
# prediction fails part-way through.
with codecs.open(outputFileName, 'w', 'utf-8') as fileOutput:
    for i, line in enumerate(test_reader, 1):
        # pp.getcontent receives the whole line list plus a 0-based index.
        content = pp.getcontent(test_reader, i - 1)
        if i % 5000 == 0:
            # Progress marker every 5000 lines.
            print("%d " % i + '#' * 30)

        if content == '':
            # No usable record for this line: show it, tagged, and carry on.
            print("test.py#" * 3 + line)
            continue

        # Text before the first tab is written out as the record id;
        # everything after it is what gets classified.  Using partition also
        # avoids rebinding the builtins `str` and `len` at module level.
        record_id, _, text = content.partition('\t')
        result = grocery.predict(text)
        fileOutput.write(record_id + ',' + result + '\n')