Пример #1
0
def trainAndTest(trains, tests, result_file, law_index, lenTP, lenTN, lenP,
                 lenN, log_file):
    """Train and test the maxent classifier, then score its predictions.

    Runs ``maxent.me_classify`` on ``trains``/``tests`` with ``result_file``,
    reads the predictions back through ``maxent.getPredProb(result_file)``
    and hands them to ``maxent.createPRF_me`` together with ``law_index``,
    the four ``len*`` values and ``log_file``.

    ``law_index`` is the number (position) of the legal article; the
    matching law names can be looked up in
    ``new_all_pieces_count_dict2.txt``.

    Returns:
        The tuple ``(pred_prob, pred_label, real_label, acc)``, where
        ``acc`` is whatever ``maxent.createPRF_me`` returns.
    """
    maxent.me_classify(trains, tests, result_file)  # train + test
    # Parse the classifier output.
    pred_prob, pred_label, real_label = maxent.getPredProb(result_file)
    acc = maxent.createPRF_me(pred_label, real_label, law_index, lenTP, lenTN,
                              lenP, lenN, log_file)
    # Call form prints the same under Python 2 and is valid Python 3 syntax.
    print(acc)
    return pred_prob, pred_label, real_label, acc
def run(seed):
    """Build a fixed train/test split from ``createDomain()`` and classify it.

    The first 72 documents of each of the seven classes form the test set.
    Classes 0-5 are then padded up to 2204 documents by repeatedly appending
    a random pick from the class's non-test documents, and everything after
    the first 72 documents of every class becomes the training set.

    Args:
        seed: Forwarded unchanged to ``maxent.createPRF``; this function
            does not seed ``random`` itself.

    Returns:
        The ``(acc, G_mean)`` pair returned by
        ``maxent.createPRF('result.txt', seed)``.
    """
    class_count = 7        # classes read from ``domain``
    resampled_classes = 6  # only classes 0..5 are padded (see note below)
    test_size = 72         # leading documents of each class held out for testing
    target_size = 2204     # length each re-sampled class is padded up to

    domain = createDomain()
    for k in range(class_count):
        print('len(e%d):' % k + str(len(domain[k])))

    # Test samples: the fixed first ``test_size`` documents of every class.
    # Taken before re-sampling so the test set never contains duplicates.
    tests = []
    for k in range(class_count):
        tests += domain[k][:test_size]

    # Re-sampling. The pool is re-sliced on every draw, so documents that
    # were appended by earlier draws can be picked again.
    # NOTE(review): class 6 is not padded -- presumably it is already the
    # largest class (2204 documents); confirm against the data.
    for k in range(resampled_classes):
        for _ in range(target_size - len(domain[k])):
            domain[k].append(random.choice(domain[k][test_size:]))

    # Training samples: everything that is not part of the test set,
    # including the re-sampled duplicates.
    trains = []
    for k in range(class_count):
        trains += domain[k][test_size:]

    print('len(trains):' + str(len(trains)))
    print('len(tests):' + str(len(tests)))

    maxent.me_classify(trains, tests)
    acc, G_mean = maxent.createPRF('result.txt', seed)
    return acc, G_mean
Пример #3
0
def _scoresToVectors(docs, resultsList):
    """Build one meta-level CDocument per document from per-classifier scores.

    ``resultsList[c][d]`` is the score classifier ``c`` gave document ``d``.
    A positive score is used as the positive-class value (negative = 1 - it),
    a negative score's magnitude is used as the negative-class value
    (positive = 1 - it), and a zero score yields 0 for both.  Classifier
    ``c`` fills the features named ``str(2*c)`` and ``str(2*c + 1)``.
    """
    vectors = []
    for docIdx, doc in enumerate(docs):
        vector = CDocument(doc.polarity, {})
        for clsIdx, scores in enumerate(resultsList):
            posProb = scores[docIdx]
            if posProb > 0:
                negProb = 1 - posProb
            elif posProb < 0:
                negProb = abs(posProb)
                posProb = 1 - negProb
            else:
                negProb = posProb = 0
            vector.words[str(clsIdx * 2)] = posProb
            vector.words[str(clsIdx * 2 + 1)] = negProb
        vectors.append(vector)
    return vectors


def stackingCombined(trains, tests, classifies, fold=10):
    """Combine several classifiers by stacking and classify ``tests``.

    ``trains`` is cut into ``fold`` consecutive slices of
    ``len(trains) // fold`` documents.  For every slice, each classifier is
    run on the remaining slices and its scores for the held-out slice become
    meta-level features.  The classifiers are then run on the full
    ``trains``/``tests`` to build the meta-level test vectors, and
    ``me_classify`` is run on the meta-level data.

    Documents beyond ``fold * (len(trains) // fold)`` take no part in the
    meta-level training set.

    Args:
        trains: Training documents; each must expose ``polarity``.
        tests: Test documents; each must expose ``polarity``.
        classifies: Callables ``classify(trains, tests)`` whose return
            value holds the per-test-document scores at index 1.
        fold: Number of slices used to produce the meta-level training data.

    Returns:
        The ``(acc, results)`` pair returned by ``me_classify``.
    """
    subLen = len(trains) // fold
    vTrains = []
    for heldOut in range(fold):
        subTrains = []
        subTests = []
        for part in range(fold):
            chunk = trains[part * subLen:(part + 1) * subLen]
            if part == heldOut:
                subTests += chunk
            else:
                subTrains += chunk

        resultsList = [classify(subTrains, subTests)[1]
                       for classify in classifies]
        vTrains.extend(_scoresToVectors(subTests, resultsList))

    resultsList = [classify(trains, tests)[1] for classify in classifies]
    vTests = _scoresToVectors(tests, resultsList)
    acc, results = me_classify(vTrains, vTests)

    # Call form prints the same under Python 2 and is valid Python 3 syntax.
    print('combined results: %f' % acc)
    return acc, results
Пример #4
0
def classify_sentiment(pTrains, pTests):
    """Classify using features from ``getSentimentFeatures``.

    Args:
        pTrains: Iterable of ``(label, p)`` pairs used for training.
        pTests: Iterable of ``(label, p)`` pairs used for testing.

    Returns:
        Whatever ``me_classify`` returns for the converted documents.
    """
    cn_lexicon = CnSentimentLexicon()
    en_lexicon = EnSentimentLexicon()

    def toDocument(label, p):
        # Both lexicons are created once above and shared by every document.
        features = getSentimentFeatures(p, cn_lexicon, en_lexicon)
        return CDocument(label, features)

    trains = [toDocument(label, p) for label, p in pTrains]
    tests = [toDocument(label, p) for label, p in pTests]
    return me_classify(trains, tests)
Пример #5
0
def classify_translate_cerelation(pTrains, pTests):
    """Classify using features from ``getTranlateFeaturesCERelation``.

    Args:
        pTrains: Iterable of ``(label, p)`` pairs used for training.
        pTests: Iterable of ``(label, p)`` pairs used for testing.

    Returns:
        Whatever ``me_classify`` returns for the converted documents.
    """
    # One CEDict and one PMI instance are shared by all feature extractions.
    ceDict = CEDict()
    pmi = PMI()

    trains = [
        CDocument(label, getTranlateFeaturesCERelation(p, ceDict, pmi))
        for label, p in pTrains
    ]
    tests = [
        CDocument(label, getTranlateFeaturesCERelation(p, ceDict, pmi))
        for label, p in pTests
    ]
    return me_classify(trains, tests)
Пример #6
0
def classify_translate_simple(pTrains, pTests):
    """Classify using features from ``getTranlateFeatures``.

    Args:
        pTrains: Iterable of ``(label, p)`` pairs used for training.
        pTests: Iterable of ``(label, p)`` pairs used for testing.

    Returns:
        Whatever ``me_classify`` returns for the converted documents.
    """
    ceDict = CEDict()
    # NOTE(review): ``syn`` is never read below; the construction is kept
    # because ``Synonym()`` may have side effects -- confirm before removing.
    syn = Synonym()

    trains, tests = [], []
    # Fill the training list first, then the test list, with the same recipe.
    for bucket, pairs in ((trains, pTrains), (tests, pTests)):
        for label, p in pairs:
            bucket.append(CDocument(label, getTranlateFeatures(p, ceDict)))

    return me_classify(trains, tests)
Пример #7
0
def classify_en(pTrains, pTests):
    """Classify using each item's ``en`` attribute as the document content.

    Args:
        pTrains: Iterable of ``(label, p)`` pairs used for training.
        pTests: Iterable of ``(label, p)`` pairs used for testing.

    Returns:
        Whatever ``me_classify`` returns for the converted documents.
    """
    trains = []
    for label, p in pTrains:
        trains.append(CDocument(label, p.en))

    tests = []
    for label, p in pTests:
        tests.append(CDocument(label, p.en))

    return me_classify(trains, tests)
Пример #8
0
def classify_all(pTrains, pTests):
    """Classify using each item's ``words`` attribute as the document content.

    Args:
        pTrains: Iterable of ``(label, p)`` pairs used for training.
        pTests: Iterable of ``(label, p)`` pairs used for testing.

    Returns:
        Whatever ``me_classify`` returns for the converted documents.
    """
    def asDocuments(pairs):
        return [CDocument(label, p.words) for label, p in pairs]

    # Arguments are evaluated left to right: training documents first.
    return me_classify(asDocuments(pTrains), asDocuments(pTests))
def run(trains, tests, filename, seed):
    """Run the maxent classifier and return its probabilities and true labels.

    Calls ``maxent.me_classify`` with ``filename``, parses the outcome with
    ``maxent.getPredProb(filename)`` and calls ``maxent.createPRF`` with the
    predicted labels, the real labels, ``seed`` and ``'per_crf.txt'``.

    Returns:
        The pair ``(pred_prob, real_label)`` from ``maxent.getPredProb``.
    """
    maxent.me_classify(trains, tests, filename)
    parsed = maxent.getPredProb(filename)
    pred_prob, pred_label, real_label = parsed
    # The two values returned by createPRF are not used here; the call
    # itself is kept.
    _acc, _gmean = maxent.createPRF(pred_label, real_label, seed,
                                    'per_crf.txt')
    return pred_prob, real_label
Пример #10
0
#! /usr/bin/env python
#coding=utf-8
from __future__ import division
from document import createDomain
from maxent import me_classify 

# Load the 'kitchen' domain data.
domain = createDomain('kitchen')

# The first 200 documents of domain[0] and of domain[1] are held out for
# testing; the remaining documents of both are used for training.
tests = domain[0][:200] + domain[1][:200]
trains = domain[0][200:] + domain[1][200:]

me_classify(trains, tests)
Пример #11
0
#! /usr/bin/env python
#coding=utf-8
from __future__ import division
from document import createDomain
from maxent import me_classify 

# Load the 'kitchen' domain data.
domain = createDomain('kitchen')

# The first 200 documents of domain[0] and of domain[1] are held out for
# testing; the remaining documents of both are used for training.
trains = domain[0][200:] + domain[1][200:]
tests = domain[0][:200] + domain[1][:200]

acc, results = me_classify(trains, tests)

# Call form prints the same under Python 2 and is valid Python 3 syntax.
print(acc)
print(results[:10])  # first ten entries of the results
def run(trains, tests):
    maxent.me_classify(trains, tests)