def self_training(attribute, iterate_count, initial_data_count, new_data_count):
    """Run a self-training loop with a MultinomialNB classifier.

    The data set is first rebuilt with ``construct(attribute,
    initial_data_count)``; the 'train_unlabel', 'train' and 'test' splits are
    then loaded through ``get_data``.  On every round a fresh classifier is
    fitted on the labeled data and scored on the test split, class
    probabilities are predicted for the unlabeled pool, and
    ``extract_new_data`` splits that pool into samples that join the training
    set and samples that stay in the pool.  The loop stops early when no
    sample is promoted.

    Args:
        attribute: passed through to ``construct`` and ``get_data``.
        iterate_count: maximum number of rounds.
        initial_data_count: passed to ``construct``.
        new_data_count: passed to ``extract_new_data`` (presumably the number
            of samples to promote per round -- confirm against that helper).

    Returns:
        None.  Progress and the per-round scores are printed.
    """
    from data_constructor import construct

    print('')
    construct(attribute, initial_data_count)

    # The uid columns are loaded but not used by this routine.
    pool_x, pool_y, _pool_uids = get_data(attribute, 'train_unlabel')
    labeled_x, labeled_y, _labeled_uids = get_data(attribute, 'train')
    test_x, test_y, _ = get_data(attribute, 'test')

    history = []
    for round_no in xrange(iterate_count):
        banner = (
            '----------------',
            'Iterate: %d' % round_no,
            'Labeled training data size: %d' % (len(labeled_x)),
            'Unlabeled training data size: %d' % (len(pool_x)),
            'Testing data size: %d' % (len(test_x)),
        )
        for line in banner:
            print(line)

        # A new model is trained from scratch on every round.
        model = MultinomialNB()
        model.fit(labeled_x, labeled_y)
        round_score = model.score(test_x, test_y)
        print('Accurate: %0.4f' % round_score)
        history.append(round_score)

        probabilities = model.predict_proba(pool_x)
        paired = zip(pool_x, probabilities)
        picked_x, picked_y, rest_x, rest_y = extract_new_data(
            paired, new_data_count)

        if len(picked_x) == 0:
            print('No more new train data!')
            break

        print('New training data size: %d' % (len(picked_x)))
        labeled_x = numpy.concatenate((labeled_x, picked_x), axis=0)
        labeled_y = numpy.concatenate((labeled_y, picked_y), axis=0)
        # Whatever was not promoted becomes the pool for the next round.
        pool_x, pool_y = rest_x, rest_y

    print('--------')
    for round_score in history:
        print(round_score)
    print('--------')
def iterate_learn(attribute, iterate_count, training_count):
    """Repeat the construct -> learn -> relabel cycle ``iterate_count`` times.

    Every round rebuilds the training set with ``construct(attribute,
    training_count)``, calls ``learn(attribute)``, appends a
    ``"<round> <value>"`` line to
    ``<base_dir>/label2trainset/iterate_result_<attribute>.result`` and then
    passes the feature statistics, ordered by how unbalanced their two counts
    are, to ``update_labeled_feature``.

    Args:
        attribute: attribute name forwarded to every helper and used in the
            result file name.
        iterate_count: number of rounds to run.
        training_count: passed to ``construct`` on every round.

    Returns:
        list: the value returned by ``learn`` for each round, in order.
    """
    from data_constructor import construct

    def imbalance(entry):
        # entry is a (feature, counts) pair; rank by |c0 - c1| / total.
        counts = entry[1]
        return 1.0 * abs(counts[0] - counts[1]) / sum(counts)

    result_path = (
        base_dir + '/label2trainset/iterate_result_%s.result' % attribute)
    accurates = []
    # ``with`` guarantees the result file is flushed and closed, including
    # when one of the helpers raises part-way through a run.
    with open(result_path, 'w') as fout:
        for i in xrange(iterate_count):
            print(i)
            construct(attribute, training_count)
            print('')
            accurate = learn(attribute)
            fout.write('%d %0.4f\n' % (i, accurate))
            print(accurate)
            accurates.append(accurate)

            label_distribute = statistics_after_train(
                attribute,
                method='label2trainset',
                threshold=400,
                show=False,
                feature_file_name=base_dir + '/features/mention.feature')
            ranked = sorted(
                label_distribute.items(), key=imbalance, reverse=True)
            update_labeled_feature(attribute, ranked)
    return accurates
def iterate_learn(attribute, iterate_count):
    """Repeat the construct -> learn -> relabel cycle ``iterate_count`` times.

    Every round calls ``construct(attribute)`` and ``learn(attribute)``,
    appends a ``"<round> <value>"`` line to
    ``<base_dir>/label2trainset/iterate_result_<attribute>.result`` and then
    passes the feature statistics, ordered by how unbalanced their two counts
    are, to ``update_labeled_feature``.

    Args:
        attribute: attribute name forwarded to every helper and used in the
            result file name.
        iterate_count: number of rounds to run.

    Returns:
        None.
    """
    from data_constructor import construct

    def imbalance(entry):
        # entry is a (feature, counts) pair; rank by |c0 - c1| / total.
        counts = entry[1]
        return 1.0 * abs(counts[0] - counts[1]) / sum(counts)

    result_path = (
        base_dir + '/label2trainset/iterate_result_%s.result' % attribute)
    # ``with`` guarantees the result file is flushed and closed, including
    # when one of the helpers raises part-way through a run.
    with open(result_path, 'w') as fout:
        for i in xrange(iterate_count):
            print(i)
            construct(attribute)
            print('')
            accurate = learn(attribute)
            fout.write('%d %0.4f\n' % (i, accurate))
            print(accurate)

            label_distribute = statistics_after_train(
                attribute,
                method='label2trainset',
                threshold=2000,
                show=False,
                feature_file_name=base_dir + '/features/all_features.feature')
            # The sort key needs (feature, counts) pairs.  Iterating a dict
            # yields only its keys, so expand a mapping into its items first;
            # any other iterable is ranked exactly as it was before.
            if isinstance(label_distribute, dict):
                entries = label_distribute.items()
            else:
                entries = label_distribute
            ranked = sorted(entries, key=imbalance, reverse=True)
            update_labeled_feature(attribute, ranked)
def iterate_learn(attribute, iterate_count, initial_data_count, new_data_count):
    """Iteratively learn with a training set that grows every round.

    Round ``i`` rebuilds the data with ``construct(attribute,
    initial_data_count + i * new_data_count)``, calls ``learn(attribute)`` and
    appends a ``"<round> <value>"`` line to
    ``<base_dir>/mylabel2trainset/iterate_result_<attribute>.result``.  The
    feature statistics are then reduced to the entries whose dominant count
    exceeds 70% of their total, ordered by that ratio (highest first), and
    handed to ``update_labeled_feature`` with ``max_count=2``.

    Args:
        attribute: attribute name forwarded to every helper and used in the
            result file name.
        iterate_count: number of rounds to run.
        initial_data_count: size passed to ``construct`` on round 0.
        new_data_count: amount added to that size on each later round.

    Returns:
        list: the value returned by ``learn`` for each round, in order.
    """
    from data_constructor import construct

    def dominance(entry):
        # entry is a (feature, counts) pair; share held by the largest count.
        counts = entry[1]
        return 1.0 * max(counts) / sum(counts)

    print('Attribute: %s' % attribute)
    result_path = (
        base_dir + '/mylabel2trainset/iterate_result_%s.result' % attribute)
    threshold = 0.7
    accurates = []
    # ``with`` guarantees the result file is flushed and closed, including
    # when one of the helpers raises part-way through a run.
    with open(result_path, 'w') as fout:
        for i in xrange(iterate_count):
            construct(attribute, initial_data_count + i * new_data_count)
            print('')
            print('============')
            print('Iterate: %d' % i)
            print('============')
            accurate = learn(attribute)
            fout.write('%d %f\n' % (i, accurate))
            accurates.append(accurate)

            label_distribute = statistics_after_train(
                attribute,
                method='mylabel2trainset',
                threshold=10,
                show=False,
                feature_file_name=base_dir + '/features/all_features.feature')
            confident = [
                entry for entry in label_distribute.items()
                if dominance(entry) > threshold
            ]
            confident.sort(key=dominance, reverse=True)
            update_labeled_feature(attribute, confident, max_count=2)
    return accurates
def iterate_learn(attribute, iterate_count, initial_data_count, new_data_count):
    """Learn on a growing training set and log the best result.

    Round ``i`` rebuilds the data with ``construct(attribute,
    initial_data_count + i * new_data_count)`` and calls ``learn(attribute)``,
    which returns an ``(accurate, result)`` pair.  Each round's value is
    written as ``"<round> <value>"`` to
    ``<base_dir>/iterate_result_<attribute>.result`` (truncated on every
    call).  After the last round one ``"<new_data_count> <best value>"`` line
    is appended to ``<base_dir>/best_accurate_<attribute>.result``.

    Args:
        attribute: attribute name forwarded to the helpers and used in both
            file names.
        iterate_count: number of rounds to run.
        initial_data_count: size passed to ``construct`` on round 0.
        new_data_count: amount added to that size on each later round; also
            the first field of the appended summary line.

    Returns:
        None.
    """
    from data_constructor import construct

    print('Attribute: %s' % attribute)
    result_path = base_dir + '/iterate_result_%s.result' % attribute
    best_path = base_dir + '/best_accurate_%s.result' % attribute
    # The running maximum starts at 0.0, so the logged best is never negative.
    best_accurate = 0.0
    # Nested ``with`` blocks close both files even if construct/learn raises;
    # explicit close() calls at the end ran only on the success path.
    with open(result_path, 'w') as fout:
        with open(best_path, 'a') as fbest:
            for i in xrange(iterate_count):
                training_count = initial_data_count + i * new_data_count
                construct(attribute, training_count)
                print('')
                print('============')
                print('Iterate: %d' % i)
                print('============')
                accurate, result = learn(attribute)
                best_accurate = max(best_accurate, accurate)
                fout.write('%d %f\n' % (i, accurate))
                # Disabled feature-relabeling step, kept for reference:
                #labels=get_labels(result)
                #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=new_data_count)
                #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=10)
                #update_labeled_feature(attribute,score,feature_distribute,max_count=1)
            fbest.write('%d %f\n' % (new_data_count, best_accurate))
def iterate_learn(attribute, iterate_count, initial_data_count, new_data_count):
    """Learn on a growing training set and log the best result.

    Round ``i`` rebuilds the data with ``construct(attribute,
    initial_data_count + i * new_data_count)`` and calls ``learn(attribute)``,
    which returns an ``(accurate, result)`` pair.  Each round's value is
    written as ``"<round> <value>"`` to
    ``<base_dir>/iterate_result_<attribute>.result`` (truncated on every
    call).  After the last round one ``"<new_data_count> <best value>"`` line
    is appended to ``<base_dir>/best_accurate_<attribute>.result``.

    Args:
        attribute: attribute name forwarded to the helpers and used in both
            file names.
        iterate_count: number of rounds to run.
        initial_data_count: size passed to ``construct`` on round 0.
        new_data_count: amount added to that size on each later round; also
            the first field of the appended summary line.

    Returns:
        None.
    """
    from data_constructor import construct

    print('Attribute: %s' % attribute)
    result_path = base_dir + '/iterate_result_%s.result' % attribute
    best_path = base_dir + '/best_accurate_%s.result' % attribute
    # The running maximum starts at 0.0, so the logged best is never negative.
    best_accurate = 0.0
    # Nested ``with`` blocks close both files even if construct/learn raises;
    # explicit close() calls at the end ran only on the success path.
    with open(result_path, 'w') as fout:
        with open(best_path, 'a') as fbest:
            for i in xrange(iterate_count):
                training_count = initial_data_count + i * new_data_count
                construct(attribute, training_count)
                print('')
                print('============')
                print('Iterate: %d' % i)
                print('============')
                accurate, result = learn(attribute)
                best_accurate = max(best_accurate, accurate)
                fout.write('%d %f\n' % (i, accurate))
                # Disabled feature-relabeling step, kept for reference:
                #labels=get_labels(result)
                #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=new_data_count)
                #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=10)
                #update_labeled_feature(attribute,score,feature_distribute,max_count=1)
            fbest.write('%d %f\n' % (new_data_count, best_accurate))