def reweight_evt_dics(evt_dic_mc, evt_dic_rd):
    """Reweight the MC event dictionary to match the real-data sample.

    Fits a gradient-boosted reweighter (hep_ml) on the MC (original) vs.
    real-data (target) feature arrays and returns the per-event weights
    predicted for the MC sample.

    Parameters
    ----------
    evt_dic_mc : dict
        MC event dictionary; must provide 'track' and 'event' entries
        after `shape_data`/`flatten_feature` have been applied.
    evt_dic_rd : dict
        Real-data event dictionary with the same structure.

    Returns
    -------
    numpy.ndarray
        GB-predicted weight for every MC event.
    """
    # Work on copies so the callers' dictionaries are not reshaped in place.
    # NOTE(review): dict.copy() is shallow — the contained arrays are shared
    # with the caller; confirm shape_data/flatten_feature replace rather
    # than mutate them.
    evt_dic_mc_copy = evt_dic_mc.copy()
    evt_dic_rd_copy = evt_dic_rd.copy()

    shape_data(evt_dic_mc_copy)
    shape_data(evt_dic_rd_copy)
    flatten_feature(evt_dic_mc_copy, 'track')
    flatten_feature(evt_dic_rd_copy, 'track')

    # Stack track-level and event-level features into one 2D array per sample.
    mc_array = np.c_[evt_dic_mc_copy['track'], evt_dic_mc_copy['event']]
    rd_array = np.c_[evt_dic_rd_copy['track'], evt_dic_rd_copy['event']]

    # Bug fix: the original called train_test_split(mc_array, test_size=0.4)
    # but never used the resulting mc_train/mc_test — the reweighter was
    # fitted and evaluated on the full sample regardless.  The dead split
    # is removed; fit/predict behaviour is unchanged.
    reweighter = reweight.GBReweighter(n_estimators=200,
                                       learning_rate=0.1,
                                       max_depth=3,
                                       min_samples_leaf=30)
    reweighter.fit(mc_array, rd_array)
    return reweighter.predict_weights(mc_array)
def reweightermodel(ioriginal, itarget, ioriginal_weights, itarget_weights, args):
    """Build and fit a seeded, 3-fold gradient-boosted reweighter.

    ``args`` packs the hyper-parameters positionally:
    (n_estimators, learning_rate, max_depth, min_samples_leaf,
     subsample, random_seed).

    Returns the fitted ``reweight.FoldingReweighter``.
    """
    n_trees, rate, depth, leaf_size, subsample, seed = args[:6]

    # Pin numpy's global RNG so the fit is reproducible.
    numpy.random.seed(seed)

    base = reweight.GBReweighter(n_estimators=n_trees,
                                 learning_rate=rate,
                                 max_depth=depth,
                                 min_samples_leaf=leaf_size,
                                 gb_args={'subsample': subsample})
    folded = reweight.FoldingReweighter(base,
                                        random_state=seed,
                                        n_folds=3,
                                        verbose=False)
    folded.fit(ioriginal, itarget, ioriginal_weights, itarget_weights)
    return folded
## 'original_weights_test.png') ## ### Gradient boosted Reweighter ##reweighter = reweight.GBReweighter(n_estimators=50, learning_rate=0.1, ## max_depth=3, min_samples_leaf=1000, ## gb_args={'subsample': 0.4}) ##reweighter.fit(original_train, target_train) ##gb_weights_test = reweighter.predict_weights(original_test) ## ### Validate reweighting rule on the test part comparing 1d projections ##draw_distributions(original_test, target_test, gb_weights_test, ## 'gb_weights_test.png') ## # Folding Reweighter # define base reweighter reweighter_base = reweight.GBReweighter(n_estimators=50, learning_rate=0.1, max_depth=2, min_samples_leaf=1000, gb_args={'subsample': 0.4}) reweighter = reweight.FoldingReweighter(reweighter_base, n_folds=2) # not need to divide data into train/test parts reweighter.fit(original, target, target_weight=target_sWeights) folding_weights = reweighter.predict_weights(original) # cast the array into float cast_target_sWeights = target_sWeights.astype(float) draw_distributions_weighted(original, target, folding_weights, cast_target_sWeights, 'FoldingReweight.png') #draw_distributions(original, target, folding_weights, # 'FoldingReweight.png')
# print 'After binned re-weighting' # draw_distributions(original_test.iloc[:,:-1], target_test.iloc[:,:-1], bins_weights_test, target_weights_test) # ********************************************** # *********Gradient Boosted Re-weighting******** # This is currently the best config for Run1 # reweighter = reweight.GBReweighter(n_estimators=200, learning_rate=0.1, # max_depth=3, min_samples_leaf=50, # gb_args={'subsample': 0.2, # 'random_state': 42}) reweighter = reweight.GBReweighter(n_estimators=100, learning_rate=0.2, max_depth=4, min_samples_leaf=50, gb_args={ 'subsample': 0.5, 'random_state': 42 }) # reweighter = reweight.GBReweighter(n_estimators=50, learning_rate=0.1, # max_depth=3, min_samples_leaf=100, # gb_args={'subsample': 0.5, # 'random_state': 42}) # reweighter.fit(original_train.iloc[:, :-1], target_train.iloc[:, :-1], # original_weights_train, target_weights_train) reweighter.fit(original.iloc[:, :-1], target.iloc[:, :-1], original_weights, target_weights) # gb_weights_test = reweighter.predict_weights(original_test.iloc[:, :-1])
#bins_weights = bins_reweighter.predict_weights(original) ## validate reweighting rule on the test part comparing 1d projections #draw_distributions(original, target, bins_weights, target_weights) ##====================gb reweighter reweight!!! """ the following set are used for the low statistic case feel free to increase the n_estimators (number of trees) and min_samples_leaf (minimal number of evnts in the leaf) if you have enough statistics usually set to be n_estimators = 200 , min_samples_leaf=1000 ; """ reweighter = reweight.GBReweighter(n_estimators=70, learning_rate=0.1, max_depth=3, min_samples_leaf=100, gb_args={'subsample': 0.7}) reweighter.fit(original, target, original_weights, target_weights) gb_weights_test = reweighter.predict_weights(original) gb_weights_used = reweighter.predict_weights(used) print(type(gb_weights_used)) #reweighting done #show the weight results #validate reweighting vars on the test part comparing 1d projections draw_distributions(original, target, gb_weights_test, target_weights) #saving weight to root
def reweightermodel(original, target, original_weights, target_weights, args):
    """Fit a 2-fold gradient-boosted folding reweighter.

    ``args`` packs the GBReweighter hyper-parameters positionally:
    (n_estimators, learning_rate, max_depth, min_samples_leaf, subsample).
    The folding random_state is fixed at 2019 for reproducibility.

    Returns the fitted ``reweight.FoldingReweighter``.
    """
    n_trees, rate, depth, leaf_size, subsample = args[:5]

    gb = reweight.GBReweighter(n_estimators=n_trees,
                               learning_rate=rate,
                               max_depth=depth,
                               min_samples_leaf=leaf_size,
                               gb_args={'subsample': subsample})
    folding = reweight.FoldingReweighter(gb,
                                         random_state=2019,
                                         n_folds=2,
                                         verbose=True)
    folding.fit(original, target, original_weights, target_weights)
    return folding
def run():
    """Command-line entry point.

    Trains a gradient-boosted reweighter mapping the 'original' sample
    onto the 'target' sample, validates it on a held-out split via
    control plots, and optionally applies it to a simulated signal
    sample (``--result``).  All plots go to ``args.outputdir``.
    """
    args = getArgs().parse_args()
    ##
    ## TRAINING + VALIDATION OF BDT
    ##
    original = readData(args.original)
    target = readData(args.target)
    # Both samples start from unit weights.
    original_weights = np.ones(len(original))
    # NOTE(review): target_weights is never used in this function —
    # the fit below runs unweighted; confirm this is intentional.
    target_weights = np.ones(len(target))
    # divide original samples into training and test parts
    original_train, original_test = train_test_split(original)
    # divide target samples into training and test parts
    target_train, target_test = train_test_split(target)
    original_weights_train = np.ones(len(original_train))
    original_weights_test = np.ones(len(original_test))
    # Kinematic observables drawn in the control plots.
    columns = [
        'hs_pt', 'wp_pt', 'wm_pt', 'met', 'hs_abseta', 'wp_abseta',
        'wm_abseta', 'dRWW'
    ]
    print('train', len(original_train))
    print('test', len(original_test))
    # create output folder (EAFP: swallow the error if it already exists)
    try:
        makedirs(args.outputdir)
    except OSError:
        pass
    # draw full distributions
    drawDistributions(original, target, original_weights, columns,
                      join(args.outputdir, 'total.png'))
    # draw train distributions
    drawDistributions(original_train, target_train, original_weights_train,
                      columns, join(args.outputdir, 'train.png'))
    # draw test distributions
    drawDistributions(original_test, target_test, original_weights_test,
                      columns, join(args.outputdir, 'test_before.png'))
    # gradient boosted reweighting
    reweighter = reweight.GBReweighter(n_estimators=200,
                                       learning_rate=0.1,
                                       max_depth=4,
                                       min_samples_leaf=1000,
                                       gb_args={'subsample': 0.4})
    reweighter.fit(original_train, target_train)
    gb_weights_test = reweighter.predict_weights(original_test)
    # validate reweighting rule on the test part comparing 1d projections
    drawDistributions(original_test, target_test, gb_weights_test, columns,
                      join(args.outputdir, 'test_bdt.png'))
    ##
    ## REWEIGHTING THE SIMULATED SIGNAL SAMPLE
    ##
    if args.result:
        result = readData(args.result, clean=False)
        result_weights = np.ones(len(result))
        print('result', len(result))
        # reweight result
        gb_weights_result = reweighter.predict_weights(result)
        # plot result comparing 1d projections (before vs. after weights)
        drawDistributions(result, target_test, result_weights,
                          columns, join(args.outputdir, 'result_before.png'))
        drawDistributions(result, target_test, gb_weights_result,
                          columns, join(args.outputdir, 'result_bdt.png'))
# nrows=1, ncols=1, hist_settings=hist_settings) # # draw_distributions('n100gbr4_validate_KuPT.png', # [columns[3]], original_test, target_test, gb_weights_test, # filename_as_title=True, # # yscale=('log',), # nrows=1, ncols=1, hist_settings=hist_settings) ###################### # Folding Reweighter # ###################### # Define base reweighter reweighter_base = reweight.GBReweighter( n_estimators=120, learning_rate=0.1, max_depth=3, min_samples_leaf=5000, ) reweighter = reweight.FoldingReweighter(reweighter_base, n_folds=2) # Not need to divide data into train/test parts reweighter.fit(original, target, target_weight=target_weights) # Prediect weights for the input file folding_weights = reweighter.predict_weights(toReweight) draw_distributions( 'GBR4_validate.png', columns, toReweight,
# NOTE(review): this chunk starts mid-structure — the [i]-indexed
# assignments below presumably sit inside an enclosing
# "for i in range(sets)" loop whose header lies before this excerpt;
# confirm the indentation against the full file.
weightBase[i] = probabilityToWeight(baseProba[i][:,1])
weightLbfgs[i] = probabilityToWeight(lbfgsProba[i][:,1])
weightAda[i] = probabilityToWeightAda(adaProba[i][:,1])
weightSgd[i] = probabilityToWeight(sgdProba[i][:,1])

"""# Gradient Boosting Reweighter"""

#Gradient boosted reweighter
gb_weights_test = np.empty(sets,dtype=object)
for i in range(sets):
    #Can mess with these parameters to optimise performance
    gb_reweighter = reweight.GBReweighter(n_estimators=100, learning_rate=0.1,
                                          max_depth=32, min_samples_leaf=500,
                                          gb_args={'subsample': 0.4,
                                                   'max_features' : 6,
                                                   'min_samples_split' : 201})
    gb_reweighter.fit(original_train[i], target_train[i])
    gb_weights_test[i] = gb_reweighter.predict_weights(original_test[i])

#Check weighted distributions on the test splits
draw_distributions(original_test[1], target_test[1], gb_weights_test[1])

#Folding Reweighter
folding_weights = np.empty(sets,dtype=object)
for i in range(sets):
    #Gradient boosted decision tree as base
    reweighter_gb = reweight.GBReweighter(
        learning_rate=0.1,
        n_estimators=64,
        max_depth=32,
        min_samples_leaf=200,
        gb_args={'subsample': 0.4,})
    # NOTE(review): the loop body continues beyond this excerpt — the
    # folding fit/predict presumably follow in the full file.