def trainAndTest(trains, tests, result_file, law_index, lenTP, lenTN, lenP, lenN, log_file): maxent.me_classify(trains, tests, result_file) # 训练 + 测试 pred_prob, pred_label, real_label = maxent.getPredProb(result_file) # 解析结果 # law_index 为法律条文的编号(顺序),可在new_all_pieces_count_dict2.txt中查看具体的法律名称 acc = maxent.createPRF_me(pred_label, real_label, law_index, lenTP, lenTN, lenP, lenN, log_file) print acc return pred_prob, pred_label, real_label, acc
def run(seed):
    """Build the 7-class corpus, oversample training data to balance classes,
    train/test a maxent classifier, and return (acc, G_mean).

    The first 72 documents of each class form the fixed test set; the rest of
    each class is padded by sampling with replacement up to 2204 documents and
    used for training.  Metrics are computed from 'result.txt'.
    """
    domain = createDomain()
    for idx in range(7):
        print('len(e%d):%d' % (idx, len(domain[idx])))

    # Fixed test set: the first 72 documents of every class (72 * 7).
    tests = []
    for docs in domain:
        tests += docs[:72]

    # Re-sampling: pad each class up to 2204 training documents by drawing
    # with replacement from its non-test portion.  NOTE: the original code
    # iterated range(6) and never rebalanced domain[6]; range(7) covers all
    # seven classes and is a no-op for any class already at >= 2204 docs.
    for j in range(7):
        for _ in range(2204 - len(domain[j])):
            domain[j].append(random.choice(domain[j][72:]))

    # Training set: everything past the held-out prefix, after oversampling.
    trains = []
    for docs in domain:
        trains += docs[72:]

    print('len(trains):' + str(len(trains)))
    print('len(tests):' + str(len(tests)))

    maxent.me_classify(trains, tests)
    # createResult2 is kept for its side effects on 'result.txt' parsing;
    # its return value is superseded by createPRF below.
    acc = maxent.createResult2('result.txt')
    acc, G_mean = maxent.createPRF('result.txt', seed)
    return acc, G_mean
def stackingCombined(trains,tests,classifies,fold=10): subLen=len(trains)//fold vectors=[] for i in range(fold): resultsList=[] subTrains=[] subTests=[] for j in range(fold): if j==i: subTests+=trains[j*subLen:(j+1)*subLen] else: subTrains+=trains[j*subLen:(j+1)*subLen] for classify in classifies: resultsList.append(classify(subTrains,subTests)[1]) for i,subTest in enumerate(subTests): vector=CDocument(subTest.polarity,{}) for j in range(len(resultsList)): posProb=resultsList[j][i] if posProb>0: negProb=1-posProb elif posProb<0: negProb=abs(posProb) posProb=1-negProb else: negProb=posProb=0 vector.words[str(j*2)]=posProb vector.words[str(j*2+1)]=negProb vectors.append(vector) vTrains=vectors resultsList=[] for classify in classifies: resultsList.append(classify(trains,tests)[1]) vTests=[] for i,test in enumerate(tests): vector=CDocument(test.polarity,{}) for j in range(len(resultsList)): posProb=resultsList[j][i] if posProb>0: negProb=1-posProb elif posProb<0: negProb=abs(posProb) posProb=1-negProb else: negProb=posProb=0 vector.words[str(j*2)]=posProb vector.words[str(j*2+1)]=negProb vTests.append(vector) acc,results=me_classify(vTrains,vTests) print 'combined results: %f' % acc return acc,results
def classify_sentiment(pTrains, pTests):
    """Classify parallel documents using sentiment-lexicon features.

    Features are extracted with both the Chinese and the English sentiment
    lexicons; returns whatever me_classify returns.
    """
    cn_lexicon = CnSentimentLexicon()
    en_lexicon = EnSentimentLexicon()
    trains = [CDocument(label, getSentimentFeatures(p, cn_lexicon, en_lexicon))
              for label, p in pTrains]
    tests = [CDocument(label, getSentimentFeatures(p, cn_lexicon, en_lexicon))
             for label, p in pTests]
    return me_classify(trains, tests)
def classify_translate_cerelation(pTrains, pTests):
    """Classify using translation features weighted by Chinese-English PMI.

    Returns whatever me_classify returns.
    """
    # Renamed from `dict`, which shadowed the builtin of the same name.
    ce_dict = CEDict()
    pmi = PMI()
    trains = [CDocument(label, getTranlateFeaturesCERelation(p, ce_dict, pmi))
              for label, p in pTrains]
    tests = [CDocument(label, getTranlateFeaturesCERelation(p, ce_dict, pmi))
             for label, p in pTests]
    return me_classify(trains, tests)
def classify_translate_simple(pTrains, pTests):
    """Classify using plain dictionary-translation features.

    Returns whatever me_classify returns.
    """
    # Renamed from `dict`, which shadowed the builtin of the same name.
    # The original also built an unused `syn = Synonym()` (like the already
    # commented-out LanguageModel); dropped along with it.
    ce_dict = CEDict()
    trains = [CDocument(label, getTranlateFeatures(p, ce_dict))
              for label, p in pTrains]
    tests = [CDocument(label, getTranlateFeatures(p, ce_dict))
             for label, p in pTests]
    return me_classify(trains, tests)
def classify_en(pTrains, pTests):
    """Classify using only the English-side words of each parallel document."""
    def as_documents(pairs):
        # One CDocument per (label, parallel-doc) pair, English side only.
        return [CDocument(label, p.en) for label, p in pairs]

    return me_classify(as_documents(pTrains), as_documents(pTests))
def classify_all(pTrains, pTests):
    """Classify using the full word features of each parallel document."""
    def as_documents(pairs):
        # One CDocument per (label, parallel-doc) pair, all words.
        return [CDocument(label, p.words) for label, p in pairs]

    return me_classify(as_documents(pTrains), as_documents(pTests))
def run(trains, tests, filename, seed):
    """Train/test a maxent classifier and log P/R/F metrics.

    Predictions go to `filename`; metrics are appended to 'per_crf.txt'.
    Returns (pred_prob, real_label).
    """
    maxent.me_classify(trains, tests, filename)
    probabilities, predicted, actual = maxent.getPredProb(filename)
    # createPRF is called for its logging side effect; its (acc, gmean)
    # return value is not propagated to the caller.
    maxent.createPRF(predicted, actual, seed, 'per_crf.txt')
    return probabilities, actual
#! /usr/bin/env python
#coding=utf-8
"""Train/test a maxent classifier on the 'kitchen' domain.

The first 200 documents of each of the two classes are held out as the test
set; the remainder forms the training set.
"""
from __future__ import division
from document import createDomain
from maxent import me_classify

HOLD_OUT = 200

domain = createDomain('kitchen')
trains = domain[0][HOLD_OUT:] + domain[1][HOLD_OUT:]
tests = domain[0][:HOLD_OUT] + domain[1][:HOLD_OUT]
me_classify(trains, tests)
#! /usr/bin/env python #coding=utf-8 from __future__ import division from document import createDomain from maxent import me_classify domain=createDomain('kitchen') trains=domain[0][200:]+domain[1][200:] tests=domain[0][:200]+domain[1][:200] acc,results=me_classify(trains,tests) print acc print results[:10]
def run(trains, tests):
    """Train a maxent classifier on `trains` and evaluate it on `tests`.

    Thin wrapper around maxent.me_classify; returns None.  Any result is
    presumably written out by me_classify itself — confirm against maxent.
    """
    maxent.me_classify(trains, tests)