Example #1
def self_training(attribute, iterate_count, initial_data_count, new_data_count):
    # MultinomialNB and numpy are imported here for completeness; get_data and
    # extract_new_data are helpers defined elsewhere in this module.
    from sklearn.naive_bayes import MultinomialNB
    import numpy
    from data_constructor import construct
    print ''

    construct(attribute,initial_data_count)
    unlabel_train_x,unlabel_train_y,unlabel_train_uids=get_data(attribute,'train_unlabel')
    train_x,train_y,train_uids=get_data(attribute,'train')
    test_x,test_y,_=get_data(attribute,'test')

    scores=[]
    for i in xrange(iterate_count):
        print '----------------'
        print 'Iterate: %d'%i
        print 'Labeled training data size: %d'%(len(train_x))
        print 'Unlabeled training data size: %d'%(len(unlabel_train_x))
        print 'Testing data size: %d'%(len(test_x))
        clf=MultinomialNB()
        clf.fit(train_x,train_y)
        score=clf.score(test_x,test_y)
        print 'Accurate: %0.4f'%score
        scores.append(score)
        result=clf.predict_proba(unlabel_train_x)
        good_x,good_y,bad_x,bad_y=extract_new_data(zip(unlabel_train_x,result),new_data_count)
        if len(good_x)==0:
            print 'No more new train data!'
            break
        print 'New training data size: %d'%(len(good_x))
        train_x=numpy.concatenate((train_x, good_x), axis=0)
        train_y=numpy.concatenate((train_y, good_y), axis=0)
        unlabel_train_x,unlabel_train_y=bad_x,bad_y
    print '--------'
    for s in scores:
        print s
    print '--------'
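
The loop above depends on a project helper, extract_new_data, that is not shown here. Below is a minimal sketch of what such a helper could look like, assuming it keeps at most new_data_count of the most confident predictions as pseudo-labeled examples and returns everything else as still unlabeled; the body and the confidence cutoff are illustrative, not the project's actual implementation.

def extract_new_data(samples_with_proba, new_data_count, confidence=0.9):
    # Illustrative sketch only, not the project's implementation.
    # samples_with_proba: iterable of (feature_vector, class_probabilities)
    samples = list(samples_with_proba)
    # most confident predictions first
    samples.sort(key=lambda item: max(item[1]), reverse=True)
    good_x, good_y, bad_x, bad_y = [], [], [], []
    for x, proba in samples:
        proba = list(proba)
        top = max(proba)
        label = proba.index(top)  # index of the most likely class
        if len(good_x) < new_data_count and top >= confidence:
            good_x.append(x)
            good_y.append(label)
        else:
            bad_x.append(x)
            bad_y.append(label)
    return good_x, good_y, bad_x, bad_y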
Example #2
File: learn.py  Project: Adoni/JD_Profiling
def iterate_learn(attribute, iterate_count, training_count):
    from data_constructor import construct
    fout = open(
        base_dir + '/label2trainset/iterate_result_%s.result' % attribute, 'w')
    accurates = []
    for i in xrange(iterate_count):
        print i
        construct(attribute, training_count)
        print ''
        accurate = learn(attribute)
        fout.write('%d %0.4f\n' % (i, accurate))
        print accurate
        accurates.append(accurate)
        label_distribute = statistics_after_train(attribute,
                                                  method='label2trainset',
                                                  threshold=400,
                                                  show=False,
                                                  feature_file_name=base_dir +
                                                  '/features/mention.feature')
        label_distribute = sorted(
            label_distribute.items(),
            key=lambda d: 1.0 * abs(d[1][0] - d[1][1]) / sum(d[1]),
            reverse=True)
        update_labeled_feature(attribute, label_distribute)
    return accurates
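
The sort key in this example ranks each feature by how lopsided its two label counts are, so the most one-sided features are passed to update_labeled_feature first. A quick illustration with made-up counts:

# Made-up feature/label counts, just to show how the sort key orders them.
toy_distribute = {'f1': (90, 10), 'f2': (55, 45), 'f3': (30, 70)}
ranked = sorted(
    toy_distribute.items(),
    key=lambda d: 1.0 * abs(d[1][0] - d[1][1]) / sum(d[1]),
    reverse=True)
# ranked == [('f1', (90, 10)), ('f3', (30, 70)), ('f2', (55, 45))]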
Example #3
File: learn.py  Project: Adoni/JD_Profiling
def iterate_learn(attribute,iterate_count):
    from data_constructor import construct
    fout=open(base_dir+'/label2trainset/iterate_result_%s.result'%attribute,'w')
    for i in xrange(iterate_count):
        print i
        construct(attribute)
        print ''
        accurate=learn(attribute)
        fout.write('%d %0.4f\n'%(i,accurate))
        print accurate
        label_distribute = statistics_after_train(
            attribute, method='label2trainset', threshold=2000, show=False,
            feature_file_name=base_dir + '/features/all_features.feature')
        # .items() so the sort key sees (feature, counts) pairs, as in the
        # other versions of this function
        label_distribute = sorted(
            label_distribute.items(),
            key=lambda d: 1.0 * abs(d[1][0] - d[1][1]) / sum(d[1]),
            reverse=True)
        update_labeled_feature(attribute, label_distribute)
Example #4
def iterate_learn(attribute,iterate_count,initial_data_count,new_data_count):
    from data_constructor import construct
    print 'Attribute: %s'%attribute
    fout=open(base_dir+'/mylabel2trainset/iterate_result_%s.result'%attribute,'w')
    accurates=[]
    for i in xrange(iterate_count):
        construct(attribute,initial_data_count+i*new_data_count)
        print ''
        print '============'
        print 'Iterate: %d'%i
        print '============'
        accurate=learn(attribute)
        fout.write('%d %f\n'%(i,accurate))
        accurates.append(accurate)
        label_distribute = statistics_after_train(
            attribute, method='mylabel2trainset', threshold=10, show=False,
            feature_file_name=base_dir + '/features/all_features.feature')
        threshold=0.7
        label_distribute=filter(lambda d:1.0*max(d[1])/sum(d[1])>threshold, label_distribute.items())
        label_distribute=sorted(label_distribute,key=lambda d:1.0*max(d[1])/sum(d[1]), reverse=True)
        update_labeled_feature(attribute,label_distribute,max_count=2)
    return accurates
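
Compared with Example #2, this version first drops features whose dominant label covers no more than 70% of the counts, then ranks the survivors by that same ratio. With made-up counts:

# Made-up counts: f2 is dropped (0.6 <= 0.7), f1 and f3 are kept and ranked.
toy = {'f1': (9, 1), 'f2': (6, 4), 'f3': (2, 8)}
threshold = 0.7
kept = filter(lambda d: 1.0 * max(d[1]) / sum(d[1]) > threshold, toy.items())
kept = sorted(kept, key=lambda d: 1.0 * max(d[1]) / sum(d[1]), reverse=True)
# kept == [('f1', (9, 1)), ('f3', (2, 8))]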
Example #5
def iterate_learn(attribute,iterate_count,initial_data_count,new_data_count):
    from data_constructor import construct
    print 'Attribute: %s'%attribute
    fout=open(base_dir+'/iterate_result_%s.result'%attribute,'w')
    fbest=open(base_dir+'/best_accurate_%s.result'%attribute,'a')
    best_accurate=0.0
    for i in xrange(iterate_count):
        training_count=initial_data_count+i*new_data_count
        construct(attribute,training_count)
        print ''
        print '============'
        print 'Iterate: %d'%i
        print '============'
        accurate,result=learn(attribute)
        best_accurate=max(best_accurate,accurate)
        fout.write('%d %f\n'%(i,accurate))
        #labels=get_labels(result)
        #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=new_data_count)
        #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=10)
        #update_labeled_feature(attribute,score,feature_distribute,max_count=1)
    fbest.write('%d %f\n'%(new_data_count,best_accurate))
    fout.close()
    fbest.close()
Example #6
def iterate_learn(attribute, iterate_count, initial_data_count,
                  new_data_count):
    from data_constructor import construct
    print 'Attribute: %s' % attribute
    fout = open(base_dir + '/iterate_result_%s.result' % attribute, 'w')
    fbest = open(base_dir + '/best_accurate_%s.result' % attribute, 'a')
    best_accurate = 0.0
    for i in xrange(iterate_count):
        training_count = initial_data_count + i * new_data_count
        construct(attribute, training_count)
        print ''
        print '============'
        print 'Iterate: %d' % i
        print '============'
        accurate, result = learn(attribute)
        best_accurate = max(best_accurate, accurate)
        fout.write('%d %f\n' % (i, accurate))
        #labels=get_labels(result)
        #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=new_data_count)
        #score,feature_distribute=statistics(labels=labels,feature_file_name=base_dir+'/features/all_features.feature',threshold=10)
        #update_labeled_feature(attribute,score,feature_distribute,max_count=1)
    fbest.write('%d %f\n' % (new_data_count, best_accurate))
    fout.close()
    fbest.close()
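
A hypothetical driver for Examples #5 and #6; the attribute name and the counts below are placeholders, not taken from the project. Each call appends one best-accuracy line for its new_data_count setting to best_accurate_<attribute>.result.

# Hypothetical usage sketch; iterate_learn itself writes the result files.
if __name__ == '__main__':
    for new_data_count in (50, 100, 200):
        iterate_learn('gender', iterate_count=10,
                      initial_data_count=500, new_data_count=new_data_count)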