def _read_prediction_records(samples_file_name, targeting_levels):
    '''
    Parse the samples file and run the predictor on every sample.

    Each non-blank line is tab-separated: the last three fields are the
    observation count, the sum of the observed values and the sum of their
    squares; every field before those forms the sample passed to
    prediction.sample_predict.

    return: list of dicts with keys 'count', 'mean', 'var', 'pred_mean',
            'pred_var', one per sample whose prediction succeeded.
    '''
    records = []
    # `with` guarantees the file is closed even if parsing or prediction raises.
    with open(samples_file_name) as samples_file:
        for line in samples_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead
                # of failing with IndexError on the field lookups below.
                continue
            fields = line.strip('\n').split('\t')
            sample = fields[0:-3]
            count = int(fields[-3])
            value_sum = float(fields[-2])
            sq_sum = float(fields[-1])
            is_success, pred_mean, pred_var, _ = prediction.sample_predict(
                sample, targeting_levels)
            if not is_success:
                continue
            mean = value_sum / count
            records.append({
                'count': count,
                'mean': mean,
                # Empirical variance from the running sums: E[x^2] - E[x]^2.
                'var': sq_sum / count - mean ** 2,
                'pred_mean': pred_mean,
                'pred_var': pred_var,
            })
    return records


def _count_weighted_rms(records, error_of):
    '''
    Root of the count-weighted mean of error_of(record) ** 2 over records.

    Returns 0.0 for an empty list (no division is performed in that case).
    '''
    total_count = sum(rec['count'] for rec in records)
    return math.sqrt(sum(
        (float(rec['count']) / total_count) * (error_of(rec) ** 2)
        for rec in records))


def sample_level_evaluation(samples_file_name, targeting_levels, min_var=10e-4):
    '''
    Compare predicted and observed mean/variance for every sample in a file.

    samples_file_name: path of a tab-separated file; see
                       _read_prediction_records for the expected columns.
    targeting_levels:  passed through unchanged to prediction.sample_predict.
    min_var:           samples whose observed variance is below this value
                       are excluded from the relative variance error only
                       (default 10e-4, the previously hard-coded cutoff).

    return:(rmse_mean,rmse_var,rmsre_mean,rmsre_var)
        rmse_*  are count-weighted root-mean-square errors,
        rmsre_* are count-weighted root-mean-square relative errors.
        All four are 0.0 when no sample is predicted successfully.

    raises: ZeroDivisionError if a successfully predicted sample has a count
            of 0 or an observed mean of 0 (or an observed variance of 0 when
            min_var <= 0); ValueError / IndexError on malformed lines.
    '''
    records = _read_prediction_records(samples_file_name, targeting_levels)

    rmse_mean = _count_weighted_rms(
        records, lambda rec: rec['mean'] - rec['pred_mean'])
    rmse_var = _count_weighted_rms(
        records, lambda rec: rec['var'] - rec['pred_var'])
    rmsre_mean = _count_weighted_rms(
        records, lambda rec: (rec['mean'] - rec['pred_mean']) / rec['mean'])

    # The relative variance error divides by the observed variance, so
    # (near-)zero variances must be dealt with first. Clamping them to a tiny
    # epsilon was tried and rejected: many samples have a single winning
    # price, and the clamp makes their relative error enormous. Such samples
    # are dropped instead, and the weights are renormalised over the rest.
    # `not (var < min_var)` rather than `var >= min_var` so that a NaN
    # variance is kept, exactly as the original deletion loop did.
    usable = [rec for rec in records if not rec['var'] < min_var]
    rmsre_var = _count_weighted_rms(
        usable, lambda rec: (rec['var'] - rec['pred_var']) / rec['var'])

    return (rmse_mean, rmse_var, rmsre_mean, rmsre_var)
# Driver script fragment (Python 2 syntax: it uses the `print` statement).
# NOTE(review): this chunk is collapsed onto ONE physical line, so the
# original line breaks are lost. As written, everything after the first '#'
# is a comment; from here it cannot be determined which of the statements
# following the '#' markers were live code and which were commented out.
# Restore the line structure from version control before relying on it.
# Statements visible on the line, in order:
#   - targeting_levels is bound to targeting_levels_1;
#   - TemplateSelection.template_select(...) is called with paths built from
#     file_name_1 / file_name_2 / common_dir;
#   - data_for_GBDT.generate(...) appears twice, once with 'mean' and once
#     with 'var', followed by GBDT.GBDT(...) for 'mean' and for 'var';
#   - '*' is added to every entry of targeting_levels;
#   - prediction.sample_predict(['*','*','*'], ...) and
#     prediction.campaign_predict([['xxx','SOMA'],'*','*'], ...) are called
#     (marked "test in t3" by the author);
#   - error accumulators are zeroed and a loop over the lines of
#     file_name_3 + 'statistics' begins, splitting each line on tabs.
# NOTE(review): the body of that final loop appears to continue beyond the
# end of this chunk -- TODO confirm against the full file.
targeting_levels=targeting_levels_1 TemplateSelection.template_select(file_name_2+'expand_samples',file_name_1+'statistics_StarTree.pickle',ordered_selected_feat,selected_feat,10,file_name_2 + '_template_list.pickle',common_dir + '/TemplateSelection.log') # data_for_GBDT.generate(file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',file_name_2+'expand_samples','mean',2,10e-5,file_name_2 + 'GBDT_train_data_mean') data_for_GBDT.generate(file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',file_name_2+'expand_samples','var',2,10e-5,file_name_2 + 'GBDT_train_data_var') GBDT.GBDT(file_name_2 + 'GBDT_train_data_mean',[1,1,1,0],10,0.001,4,'mean',common_dir + '/GBDT_mean.pickle',common_dir + '/GBDT_mean.log') GBDT.GBDT(file_name_2 + 'GBDT_train_data_var',[1,1,1,0],10,0.001,4,'var',common_dir + '/GBDT_var.pickle',common_dir + '/GBDT_var.log') # print targeting_levels for idx in range(0,len(targeting_levels)): targeting_levels[idx].add('*') #--- test in t3 (is_success,pred_mean,pred_var,plotxy)=prediction.sample_predict(['*','*','*'],targeting_levels,file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',common_dir + '/GBDT_mean.pickle',common_dir + '/GBDT_var.pickle') (is_success,plotxy)=prediction.campaign_predict([['xxx','SOMA'],'*','*'],targeting_levels,file_name_1+'statistics_StarTree.pickle',file_name_2 + '_template_list.pickle',common_dir + '/GBDT_mean.pickle',common_dir + '/GBDT_var.pickle') print targeting_levels #----------------test--------------- mean_sq_error=0 var_sq_error =0 total_count =0 r_mean_sq_error=0#relative r_var_sq_error =0#relative for line in open(file_name_3 + 'statistics'): line_array=line.split('\t') sample = line_array[0:-3] count = int(line_array[-3])
file_name_2 + 'expand_samples', 'var', 2, 10e-5, file_name_2 + 'GBDT_train_data_var') GBDT.GBDT(file_name_2 + 'GBDT_train_data_mean', [1, 1, 1, 0], 10, 0.001, 4, 'mean', common_dir + '/GBDT_mean.pickle', common_dir + '/GBDT_mean.log') GBDT.GBDT(file_name_2 + 'GBDT_train_data_var', [1, 1, 1, 0], 10, 0.001, 4, 'var', common_dir + '/GBDT_var.pickle', common_dir + '/GBDT_var.log') # print targeting_levels for idx in range(0, len(targeting_levels)): targeting_levels[idx].add('*') #--- test in t3 (is_success, pred_mean, pred_var, plotxy) = prediction.sample_predict( ['*', '*', '*'], targeting_levels, file_name_1 + 'statistics_StarTree.pickle', file_name_2 + '_template_list.pickle', common_dir + '/GBDT_mean.pickle', common_dir + '/GBDT_var.pickle') (is_success, plotxy) = prediction.campaign_predict( [['xxx', 'SOMA'], '*', '*'], targeting_levels, file_name_1 + 'statistics_StarTree.pickle', file_name_2 + '_template_list.pickle', common_dir + '/GBDT_mean.pickle', common_dir + '/GBDT_var.pickle') print targeting_levels #----------------test--------------- mean_sq_error = 0 var_sq_error = 0 total_count = 0 r_mean_sq_error = 0 #relative