import os
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
import mydataset


def train():
    trainFileName = 'train.pkl'
    testFileName = 'test.pkl'
    pipelineFileName = 'pipeline.pkl'

    # Reuse the cached, already-segmented training set when it exists.
    if os.path.exists(trainFileName):
        # Pickle files must be opened in binary mode.
        with open(trainFileName, 'rb') as fin:
            trainData = pickle.load(fin)
            trainClass = pickle.load(fin)
    else:
        trainText = mydataset.getAllTrainTextList()
        N = len(trainText)
        trainData = []
        trainClass = []
        for i, (tag, text) in enumerate(trainText, 1):
            if i % 5000 == 0:
                print('i=%08d finished %5.5f%% using jieba to cut the text\n'
                      % (i, i * 100.0 / N))
            trainData.append(text)
            trainClass.append(tag)

        with open(trainFileName, 'wb') as fout:
            pickle.dump(trainData, fout)
            pickle.dump(trainClass, fout)

    # Loading a cached pipeline is disabled for now, so the model is retrained
    # on every run.
    # if os.path.exists(pipelineFileName):
    if False:
        with open(pipelineFileName, 'rb') as fin:
            pipeline = pickle.load(fin)
    else:
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', Perceptron()),
        ])

        # The commented-out variant below also pins the classifier parameters,
        # presumably the best values from an earlier hyperparameter search.
        # pipeline.set_params(vect__max_df=0.6, clf__alpha=1e-07, clf__penalty='l2',
        #                     tfidf__norm='l1', tfidf__use_idf=True, vect__ngram_range=(1, 2))
        pipeline.set_params(vect__max_df=0.6,
                            tfidf__norm='l1',
                            tfidf__use_idf=True,
                            vect__ngram_range=(1, 2))
        pipeline.fit(trainData, trainClass)

        with open(pipelineFileName, 'wb') as fout:
            pickle.dump(pipeline, fout)


    #################################### output train result
    predicted = pipeline.predict(trainData)
    expected = trainClass

    # print('train result ' + '#' * 30)
    # print("Classification report for classifier:\n%s\n"
    #       % (metrics.classification_report(expected, predicted)))

    # Count confusion-matrix cells, treating the label u'1' as the positive class.
    TP = 0.0
    TN = 0.0
    FP = 0.0
    FN = 0.0

    for i in range(len(trainData)):
        if predicted[i] == expected[i]:
            if predicted[i] == u'1':
                TP += 1
            else:
                TN += 1
        else:
            if predicted[i] == u'1':
                FP += 1
            else:
                FN += 1

    # Precision, recall and F1, guarding against empty denominators.
    P = TP / (TP + FP) if TP + FP else 0.0
    R = TP / (TP + FN) if TP + FN else 0.0
    F = 2 * P * R / (P + R) if P + R else 0.0

    # print('train result: P=%f,R=%f,F=%f' % (P, R, F))

    return F, pipeline
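
# A minimal usage sketch (an assumption, not part of the original snippet):
# train() returns the training-set F1 together with the fitted pipeline, so a
# caller could evaluate and then reuse the model roughly like this, where
# segmentedTexts is a hypothetical list of jieba-segmented strings:
#
#     F, pipeline = train()
#     print('train F1 = %f' % F)
#     predictions = pipeline.predict(segmentedTexts)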
#clf=Perceptron()

cutModel = True

if cutModel:
    trainFileName = 'pipelineTrainCutAll.pkl'
else:
    trainFileName = 'pipelineTrain.pkl'

if os.path.exists(trainFileName):
    with open(trainFileName, 'rb') as fin:
        trainData = pickle.load(fin)
        trainClass = pickle.load(fin)
else:
    trainText = mydataset.getAllTrainTextList(cutModel)
    N = len(trainText)
    trainData = []
    trainClass = []
    for i, (tag, text) in enumerate(trainText, 1):
        if i % 5000 == 0:
            print('i=%08d finished %5.5f%% using jieba to cut the text\n' %
                  (i, i * 100.0 / N))
        trainData.append(text)
        trainClass.append(tag)

    with open(trainFileName, 'wb') as fout:
        pickle.dump(trainData, fout)
        # The snippet was truncated here; dumping trainClass mirrors the
        # identical caching blocks elsewhere in this file.
        pickle.dump(trainClass, fout)

# Example 3
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed
from sklearn.pipeline import Pipeline
import os
import pickle
import mydataset

trainFileName = 'pipelineTrain.pkl'
if os.path.exists(trainFileName):
    with open(trainFileName, 'rb') as fin:
        trainData = pickle.load(fin)
        trainClass = pickle.load(fin)
else:
    trainText = mydataset.getAllTrainTextList()
    N = len(trainText)
    trainData = []
    trainClass = []
    for i, (tag, text) in enumerate(trainText, 1):
        if i % 5000 == 0:
            print('i=%08d finished %5.5f%% using jieba to cut the text\n'
                  % (i, i * 100.0 / N))
        trainData.append(text)
        trainClass.append(tag)

    with open(trainFileName, 'wb') as fout:
        pickle.dump(trainData, fout)
        pickle.dump(trainClass, fout)

# Example 4
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os
import pickle
import mydataset

cutModel = True

trainFileName = 'pipelineTrainCutAll.pkl'
if os.path.exists(trainFileName):
    with open(trainFileName, 'rb') as fin:
        trainData = pickle.load(fin)
        trainClass = pickle.load(fin)
else:
    trainText = mydataset.getAllTrainTextList(cutModel)
    N = len(trainText)
    trainData = []
    trainClass = []
    for i, (tag, text) in enumerate(trainText, 1):
        if i % 5000 == 0:
            print('i=%08d finished %5.5f%% using jieba to cut the text\n'
                  % (i, i * 100.0 / N))
        trainData.append(text)
        trainClass.append(tag)

    with open(trainFileName, 'wb') as fout:
        pickle.dump(trainData, fout)
        pickle.dump(trainClass, fout)
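
# A hedged sketch (not in the original): the commented-out set_params call in
# the first example pins clf__alpha and clf__penalty, which suggests those
# values came from a hyperparameter search. Using the GridSearchCV imported
# above, such a search could look like this; the parameter grid below is an
# assumption, not the original one.
def gridSearchParams(trainData, trainClass, clf=SGDClassifier(class_weight='balanced')):
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])
    parameters = {
        'vect__max_df': (0.5, 0.6, 0.75),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (1e-05, 1e-06, 1e-07),
        'clf__penalty': ('l2', 'elasticnet'),
    }
    search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    search.fit(trainData, trainClass)
    return search.best_params_
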
def train(clf=SGDClassifier(class_weight='balanced')):
    trainFileName = 'train.pkl'
    testFileName = 'test.pkl'
    pipelineFileName = 'pipeline.pkl'

    # Reuse the cached, already-segmented training set when it exists.
    if os.path.exists(trainFileName):
        with open(trainFileName, 'rb') as fin:
            trainData = pickle.load(fin)
            trainClass = pickle.load(fin)
    else:
        trainText = mydataset.getAllTrainTextList()
        N = len(trainText)
        trainData = []
        trainClass = []
        for i, (tag, text) in enumerate(trainText, 1):
            if i % 5000 == 0:
                print('i=%08d finished %5.5f%% using jieba to cut the text\n' %
                      (i, i * 100.0 / N))
            trainData.append(text)
            trainClass.append(tag)

        with open(trainFileName, 'wb') as fout:
            pickle.dump(trainData, fout)
            pickle.dump(trainClass, fout)

    # Loading a cached pipeline is disabled for now, so the model is retrained
    # on every run.
    # if os.path.exists(pipelineFileName):
    if False:
        with open(pipelineFileName, 'rb') as fin:
            pipeline = pickle.load(fin)
    else:
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', clf),
        ])

        pipeline.set_params(vect__max_df=0.6,
                            tfidf__norm='l1',
                            tfidf__use_idf=True,
                            vect__ngram_range=(1, 2))
        pipeline.fit(trainData, trainClass)

        with open(pipelineFileName, 'wb') as fout:
            pickle.dump(pipeline, fout)

    #################################### output train result
    predicted = pipeline.predict(trainData)
    expected = trainClass

    # print('train result ' + '#' * 30)
    # print("Classification report for classifier:\n%s\n"
    #       % (metrics.classification_report(expected, predicted)))

    # Count confusion-matrix cells, treating the label u'1' as the positive class.
    TP = 0.0
    TN = 0.0
    FP = 0.0
    FN = 0.0

    for i in range(len(trainData)):
        if predicted[i] == expected[i]:
            if predicted[i] == u'1':
                TP += 1
            else:
                TN += 1
        else:
            if predicted[i] == u'1':
                FP += 1
            else:
                FN += 1

    # Precision, recall and F1, guarding against empty denominators.
    P = TP / (TP + FP) if TP + FP else 0.0
    R = TP / (TP + FN) if TP + FN else 0.0
    F = 2 * P * R / (P + R) if P + R else 0.0

    # print('train result: P=%f,R=%f,F=%f' % (P, R, F))

    return F, pipeline
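
# A minimal usage sketch (an assumption, not part of the original snippet):
# because this train() takes the classifier as a parameter, it can compare
# candidate models by their training-set F1. Note that pipeline.pkl is simply
# overwritten on each call.
if __name__ == '__main__':
    from sklearn.linear_model import Perceptron
    for candidate in (SGDClassifier(class_weight='balanced'), Perceptron()):
        F, pipeline = train(clf=candidate)
        print('%s: F=%f' % (candidate.__class__.__name__, F))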