def _precision_and_recall(confusion_matrix):
    """Return ``(precision, recall)`` dicts keyed by target topic id.

    ``confusion_matrix`` maps a target topic id to a dict of
    ``{assigned topic id: count}``.  A class whose denominator is zero is
    scored ``1``.
    """
    precision = {}
    recall = {}
    for label, row in confusion_matrix.items():
        true_pos = row.get(label, 0)

        # Column total: every article that was assigned to `label`.
        assigned_total = sum(
            other_row.get(label, 0) for other_row in confusion_matrix.values())
        if assigned_total == 0:
            precision[label] = 1
        else:
            precision[label] = float(true_pos) / assigned_total

        # Row total: every article whose true topic is `label`, including
        # those assigned to a topic that never occurs as a target and hence
        # has no row (and no CSV column) of its own.  Summing only over the
        # matrix keys would drop those misses and overstate recall.
        target_total = sum(row.values())
        if target_total == 0:
            recall[label] = 1
        else:
            recall[label] = float(true_pos) / target_total
    return precision, recall


def _write_evaluation_report(path, confusion_matrix, topic_by_id,
                             precision, recall, accuracy):
    """Write the confusion matrix and the derived metrics to ``path`` as CSV.

    Rows and columns cover the topics that occur as targets, in the
    iteration order of ``confusion_matrix``.
    """
    import codecs

    labels = list(confusion_matrix)
    # NOTE(review): names are written unquoted, so a topic name containing a
    # comma would shift the columns.
    with codecs.open(path, encoding='utf-8', mode='w') as out:
        # Header row: empty corner cell, one column per topic, then recall.
        out.write(',')
        for label in labels:
            out.write('%s,' % topic_by_id[label].name)
        out.write('Recall\n')

        # One row per target topic, closed by that topic's recall.
        for target in labels:
            out.write('%s,' % topic_by_id[target].name)
            for assigned in labels:
                out.write('%s,' % confusion_matrix[target].get(assigned, 0))
            out.write('%s\n' % recall[target])

        out.write('Precision,')
        for label in labels:
            out.write('%s,' % precision[label])
        out.write('\n\n')

        out.write('F-measure\n')
        f_measure_total = 0
        for label in labels:
            if precision[label] == 0 or recall[label] == 0:
                f_measure = 0
            else:
                f_measure = (2 * precision[label] * recall[label]
                             / (precision[label] + recall[label]))
            out.write('%s,%s\n' % (topic_by_id[label].name, f_measure))
            f_measure_total += f_measure
        # An empty matrix (no article evaluated) must not divide by zero.
        if labels:
            average_f_measure = f_measure_total / len(labels)
        else:
            average_f_measure = 0
        out.write('\n')
        out.write('Average F-measure,%s\n' % average_f_measure)
        out.write('Accuracy,%s\n' % accuracy)


def cross_validation(number_of_features):
    """Train a cosine-similarity classifier and evaluate it on the test set.

    Article ids are read from ``../data/train.dat`` (training) and
    ``../data/test.dat`` (evaluation), one integer per line; blank lines are
    skipped.  Each test article is classified from its text and compared with
    the level-2 parent of its stored topic.  Progress and the running
    accuracy are printed to stdout, and the confusion matrix with precision,
    recall, F-measure and accuracy is written to
    ``confusion_matrix<number_of_features>.csv`` in the working directory.

    ``number_of_features`` is passed to ``CosineSimilarityClassifier`` and is
    also used in the name of the report file.

    NOTE(review): a second ``cross_validation()`` taking no arguments is
    defined later in this file and rebinds this name, so this variant is not
    reachable by name unless one of the two is renamed.
    """
    import sys
    from Model import Article
    from Topics import Topics
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    # NOTE(review): the connection URL (including credentials) is hard-coded.
    engine = create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
    session = sessionmaker(bind=engine)()
    try:
        topics = Topics(session, CosineSimilarityClassifier(number_of_features))
        topics.print_out()

        sys.stderr.write('Adding articles\n')
        with open('../data/train.dat') as train_ids:
            for line in train_ids:
                if line.strip():
                    topics.add_article(int(line))
        topics.print_out()

        topics.build_index(
            "../data/lang/lemmatization.dat",
            "../data/lang/stopwords.dat")
        topics.train_classifier()

        topic_by_id = topics.get_topics()
        # Kept as floats so the printed counter and the ratios keep their
        # floating-point form.
        correct = 0.0
        number_of_articles = 0.0
        confusion_matrix = {}
        with open('../data/test.dat') as test_ids:
            for line in test_ids:
                if not line.strip():
                    continue
                article = session.query(Article).filter(
                    Article.id == int(line)).one()
                assigned_topic = topics.assign_topic_to_article(article.text)
                print('%s %s' % (number_of_articles, article.title))
                try:
                    target_topic = topic_by_id[
                        article.topic.id].get_parent_at_level(2)
                    row = confusion_matrix.setdefault(target_topic.id, {})
                    row[assigned_topic.id] = row.get(assigned_topic.id, 0) + 1
                    # Count the article before printing so the totals stay
                    # consistent with the matrix even if output fails.
                    number_of_articles += 1
                    if target_topic.id == assigned_topic.id:
                        correct += 1
                        marker = '+'
                    else:
                        marker = '-'
                    print(' %s  %s  /  %s' % (
                        marker, target_topic.name, assigned_topic.name))
                    print('Accuracy: %s %% ' % (
                        (correct / number_of_articles) * 100))
                except Exception:
                    # Best effort: report the problem and move on to the next
                    # article, but let KeyboardInterrupt/SystemExit through.
                    print(sys.exc_info())

        # Guard against an empty (or entirely failing) test set.
        if number_of_articles:
            accuracy = correct / number_of_articles
        else:
            accuracy = 0.0
        print('Accuracy: %s %% ' % (accuracy * 100))

        for label in confusion_matrix:
            print(topic_by_id[label].name)
        precision, recall = _precision_and_recall(confusion_matrix)

        _write_evaluation_report(
            'confusion_matrix' + str(number_of_features) + '.csv',
            confusion_matrix, topic_by_id, precision, recall, accuracy)
    finally:
        session.close()
def cross_validation():
    """Train a k-nearest-neighbour classifier and evaluate it on the test set.

    At most 10000 article ids are read from ``../data/train.dat`` for
    training; the ids in ``../data/test.dat`` are used for evaluation (one
    integer per line, blank lines skipped).  Each test article is classified
    from its text and compared with the level-2 parent of its stored topic.
    Progress and the running accuracy are printed to stdout, and the
    confusion matrix is written to ``confusion_matrix.csv`` in the working
    directory.

    NOTE(review): an earlier ``cross_validation(number_of_features)`` is
    defined above in this file; this definition rebinds the same name and
    therefore replaces it.
    """
    # psyco is only an optional accelerator; run without it when it is not
    # installed instead of failing before any work is done.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    import sys
    import codecs
    from Model import Article
    from Topics import Topics
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    max_training_articles = 10000

    # NOTE(review): the connection URL (including credentials) is hard-coded.
    engine = create_engine('mysql://*****:*****@localhost/data_mining?use_unicode=1&charset=utf8')
    session = sessionmaker(bind=engine)()
    try:
        topics = Topics(session, KNearestNeighborClassifier())
        topics.print_out()

        sys.stderr.write('Adding articles\n')
        added = 0
        with open('../data/train.dat') as train_ids:
            for line in train_ids:
                if not line.strip():
                    continue
                topics.add_article(int(line))
                added += 1
                if added == max_training_articles:
                    break

        topics.build_index(
            "../data/lang/lemmatization.dat",
            "../data/lang/stopwords.dat")
        topics.train_classifier()

        topic_by_id = topics.get_topics()
        # Kept as floats so the printed counter and the ratios keep their
        # floating-point form.
        correct = 0.0
        number_of_articles = 0.0
        confusion_matrix = {}
        with open('../data/test.dat') as test_ids:
            for line in test_ids:
                if not line.strip():
                    continue
                article = session.query(Article).filter(
                    Article.id == int(line)).one()
                assigned_topic = topics.assign_topic_to_article(article.text)
                print('%s %s' % (number_of_articles, article.title))
                try:
                    target_topic = topic_by_id[
                        article.topic.id].get_parent_at_level(2)
                    row = confusion_matrix.setdefault(target_topic.id, {})
                    row[assigned_topic.id] = row.get(assigned_topic.id, 0) + 1
                    # Count the article before printing so the totals stay
                    # consistent with the matrix even if output fails.
                    number_of_articles += 1
                    if target_topic.id == assigned_topic.id:
                        correct += 1
                        marker = '+'
                    else:
                        marker = '-'
                    print(' %s  %s  /  %s' % (
                        marker, target_topic.name, assigned_topic.name))
                    print('Accuracy: %s %% ' % (
                        (correct / number_of_articles) * 100))
                except Exception:
                    # Best effort: report the problem and move on to the next
                    # article, but let KeyboardInterrupt/SystemExit through.
                    print(sys.exc_info())

        # Guard against an empty (or entirely failing) test set.
        if number_of_articles:
            accuracy = correct / number_of_articles
        else:
            accuracy = 0.0
        print('Accuracy: %s %% ' % (accuracy * 100))

        labels = list(confusion_matrix)
        # NOTE(review): names are written unquoted, so a topic name
        # containing a comma would shift the columns.
        with codecs.open('confusion_matrix.csv',
                         encoding='utf-8', mode='w') as out:
            # Header row: empty corner cell followed by one column per topic.
            out.write(',')
            for label in labels:
                out.write('%s,' % topic_by_id[label].name)
            out.write('\n')
            # One row per target topic with the count for each assigned topic.
            for target in labels:
                out.write('%s,' % topic_by_id[target].name)
                for assigned in labels:
                    out.write('%s,' % confusion_matrix[target].get(assigned, 0))
                out.write('\n')
    finally:
        session.close()