Пример #1
0
def sample_level_evaluation(samples_file_name, targeting_levels, min_var=10e-4):
    '''
    Compare per-sample predictions with the statistics stored in a samples file.

    Every non-blank line of the file is tab-separated. Its last three fields
    are read as the observation count, the sum of the observed values and the
    sum of their squares; all preceding fields form the sample handed to
    prediction.sample_predict together with targeting_levels. Samples whose
    prediction does not succeed are ignored. Blank lines are skipped.

    All four errors weight each sample by its share of the total count.

    args:
        samples_file_name: path of the tab-separated samples file.
        targeting_levels: passed through unchanged to prediction.sample_predict.
        min_var: samples whose observed variance is below this value are left
            out of rmsre_var only. Defaults to 10e-4 (i.e. 1e-3).

    return:(rmse_mean,rmse_var,rmsre_mean,rmsre_var)
        rmse_mean / rmse_var: weighted root mean squared error of the
            predicted mean / variance against the observed one.
        rmsre_mean / rmsre_var: the same, with each error first divided by
            the observed mean / variance (relative error).
        Each value is 0.0 when no sample contributes to it.

    raises:
        IndexError / ValueError: a non-blank line has fewer than three fields
            or its last three fields are not numeric.
        ZeroDivisionError: a successfully predicted sample has a count of 0
            or an observed mean of 0.
    '''
    def weighted_rms(records, error_of):
        # Weighted root mean square of error_of(record); weights are
        # count / total count. An empty list yields sqrt(0) == 0.0.
        total_count = sum(d['count'] for d in records)
        return math.sqrt(sum((float(d['count']) / total_count) * (error_of(d) ** 2)
                             for d in records))

    pred_result = []
    # The with-block closes the file even if parsing or prediction raises.
    with open(samples_file_name) as samples_file:
        for line in samples_file:
            line = line.rstrip('\n')
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            line_array = line.split('\t')
            sample = line_array[0:-3]
            count = int(line_array[-3])
            old_sum = float(line_array[-2])
            sq_sum = float(line_array[-1])
            (is_success, pred_mean, pred_var, _plotxy) = prediction.sample_predict(
                sample, targeting_levels)
            if not is_success:
                continue
            mean = old_sum / count
            pred_result.append({
                'count': count,
                'mean': mean,
                # observed variance as E[x^2] - E[x]^2
                'var': sq_sum / count - mean ** 2,
                'pred_mean': pred_mean,
                'pred_var': pred_var,
            })

    rmse_mean = weighted_rms(pred_result, lambda d: d['mean'] - d['pred_mean'])
    rmse_var = weighted_rms(pred_result, lambda d: d['var'] - d['pred_var'])
    rmsre_mean = weighted_rms(
        pred_result, lambda d: (d['mean'] - d['pred_mean']) / d['mean'])

    # Relative variance error needs a non-zero observed variance.
    # Clamping tiny variances to a small epsilon was tried and rejected: it
    # makes the relative error extremely large, because many samples have a
    # single winning price (zero variance). Instead, samples whose variance
    # is below min_var are dropped, and the weights are recomputed over the
    # samples that remain.
    # "not (var < min_var)" keeps exactly the samples the original loop kept.
    with_var = [d for d in pred_result if not d['var'] < min_var]
    rmsre_var = weighted_rms(
        with_var, lambda d: (d['var'] - d['pred_var']) / d['var'])
    return (rmse_mean, rmse_var, rmsre_mean, rmsre_var)
Пример #2
0
# Plain assignment, not a copy: targeting_levels and targeting_levels_1 are
# the same object, so the in-place .add('*') further down is visible through
# both names.
targeting_levels=targeting_levels_1

# Template selection step. The '_template_list.pickle' path given here is the
# same path handed to data_for_GBDT.generate and to the prediction calls
# below -- presumably this call writes it; confirm against TemplateSelection.
TemplateSelection.template_select(file_name_2+'expand_samples',file_name_1+'statistics_StarTree.pickle',ordered_selected_feat,selected_feat,10,file_name_2 + '_template_list.pickle',common_dir + '/TemplateSelection.log')
# Two generate() calls that differ only in the target name ('mean' / 'var')
# and in the final path argument. Note that 10e-5 is 1e-4.
data_for_GBDT.generate(file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',file_name_2+'expand_samples','mean',2,10e-5,file_name_2 + 'GBDT_train_data_mean')
data_for_GBDT.generate(file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',file_name_2+'expand_samples','var',2,10e-5,file_name_2 + 'GBDT_train_data_var')
# One GBDT run per target, each on the matching GBDT_train_data_* path and
# each with its own .pickle and .log path under common_dir. The two .pickle
# paths are passed to the prediction calls below.
GBDT.GBDT(file_name_2 + 'GBDT_train_data_mean',[1,1,1,0],10,0.001,4,'mean',common_dir + '/GBDT_mean.pickle',common_dir + '/GBDT_mean.log')
GBDT.GBDT(file_name_2 + 'GBDT_train_data_var',[1,1,1,0],10,0.001,4,'var',common_dir + '/GBDT_var.pickle',common_dir + '/GBDT_var.log')
#


# Python 2 print statement: show the levels before they are modified.
print targeting_levels
# Add the '*' entry to every level in place (this also affects
# targeting_levels_1, see the assignment at the top).
for idx in range(0,len(targeting_levels)):
    targeting_levels[idx].add('*')
#--- test in t3
# Prediction for the all-'*' sample. None of the four results is read again
# here before is_success and plotxy are rebound by the next call.
(is_success,pred_mean,pred_var,plotxy)=prediction.sample_predict(['*','*','*'],targeting_levels,file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',common_dir + '/GBDT_mean.pickle',common_dir + '/GBDT_var.pickle')
# Campaign-level prediction; the first position is a list of two values
# instead of a single one, the other two positions are '*'.
(is_success,plotxy)=prediction.campaign_predict([['xxx','SOMA'],'*','*'],targeting_levels,file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',common_dir + '/GBDT_mean.pickle',common_dir + '/GBDT_var.pickle')

# Show the levels again, now including the added '*' entries.
print targeting_levels


#----------------test---------------
# Accumulators, all starting at zero, for the evaluation code that follows.
# The two r_* ones are the ones the original author marked as "relative".
mean_sq_error=0
var_sq_error =0
total_count =0
r_mean_sq_error=0#relative
r_var_sq_error =0#relative
for line in open(file_name_3 + 'statistics'):
    line_array=line.split('\t')
    sample = line_array[0:-3]
    count = int(line_array[-3])
Пример #3
0
                       file_name_2 + 'expand_samples', 'var', 2, 10e-5,
                       file_name_2 + 'GBDT_train_data_var')
# One GBDT run per target ('mean' / 'var'), each on the matching
# GBDT_train_data_* path and each with its own .pickle and .log path under
# common_dir. The two .pickle paths are passed to the prediction calls below.
GBDT.GBDT(file_name_2 + 'GBDT_train_data_mean', [1, 1, 1, 0], 10, 0.001, 4,
          'mean', common_dir + '/GBDT_mean.pickle',
          common_dir + '/GBDT_mean.log')
GBDT.GBDT(file_name_2 + 'GBDT_train_data_var', [1, 1, 1, 0], 10, 0.001, 4,
          'var', common_dir + '/GBDT_var.pickle', common_dir + '/GBDT_var.log')
#

# Python 2 print statement: show the levels before they are modified.
print targeting_levels
# Add the '*' entry to every level in place.
for idx in range(0, len(targeting_levels)):
    targeting_levels[idx].add('*')
#--- test in t3
# Prediction for the all-'*' sample. None of the four results is read again
# here before is_success and plotxy are rebound by the next call.
(is_success, pred_mean, pred_var, plotxy) = prediction.sample_predict(
    ['*', '*', '*'], targeting_levels,
    file_name_1 + 'statistics_StarTree.pickle',
    file_name_2 + '_template_list.pickle', common_dir + '/GBDT_mean.pickle',
    common_dir + '/GBDT_var.pickle')
# Campaign-level prediction; the first position is a list of two values
# instead of a single one, the other two positions are '*'.
(is_success, plotxy) = prediction.campaign_predict(
    [['xxx', 'SOMA'], '*', '*'], targeting_levels,
    file_name_1 + 'statistics_StarTree.pickle',
    file_name_2 + '_template_list.pickle', common_dir + '/GBDT_mean.pickle',
    common_dir + '/GBDT_var.pickle')

# Show the levels again, now including the added '*' entries.
print targeting_levels

#----------------test---------------
# Accumulators, all starting at zero, for the evaluation code that follows.
mean_sq_error = 0
var_sq_error = 0
total_count = 0
r_mean_sq_error = 0  #relative