def gbreweight(weighttree, originaltree, name, variables, n=None):
    """Reweight ``weighttree`` to match ``originaltree`` using hep_ml's GBReweighter.

    A GBReweighter is fitted on the given variables and then evaluated entry by
    entry on ``weighttree``.  The result is attached to ``weighttree`` as a
    friend tree with a single branch called ``name`` of length 2:

    * element 0 is the weight calculated by the reweighter;
    * element 1 is that weight multiplied by the existing selection weight of
      ``weighttree`` (the value of ``weighttree.selection_functor()``).

    Parameters
    ----------
    weighttree
        Tree that receives the weights.
    originaltree
        Tree whose distribution ``weighttree`` should reproduce.
    name
        Name of both the friend tree and its branch.
    variables
        Variables used for the reweighting; passed to ``get_weights_and_vals``
        and to ``weighttree.get_functor_list``.
    n
        Forwarded unchanged to ``get_weights_and_vals`` (presumably a cap on
        the number of entries used for training -- confirm against that helper).
    """
    from hep_ml.reweight import GBReweighter

    originalweights, originalvals = get_weights_and_vals(
        originaltree, variables, n)
    weightedweights, weightedvals = get_weights_and_vals(
        weighttree, variables, n)

    weighter = GBReweighter()
    print('Fit GBReweighter', name)
    # In hep_ml's vocabulary "original" is the sample that gets reweighted and
    # "target" is the sample it should resemble.  The weights are predicted
    # for weighttree below, so weighttree has to be the "original" of the fit
    # and originaltree the "target"; the previous code had these two swapped.
    weighter.fit(original=weightedvals,
                 original_weight=weightedweights,
                 target=originalvals,
                 target_weight=originalweights)

    weight = weighttree.selection_functor()
    vals = weighttree.get_functor_list(variables)

    def get_weight():
        # predict_weights expects a batch, so wrap the current entry's values
        # in a one-element list and unpack the single result.
        _w = weighter.predict_weights([vals()])[0]
        return [_w, _w * weight()]

    print('Add weights for GBReweighter', name)
    weighttree.add_friend_tree(
        name, {name: dict(function=get_weight, length=2)})
def trainBDT(self, targetSample):
    """Train a GBReweighter from this sample to ``targetSample`` and pickle it.

    Both samples must declare the same observables, in the same order;
    otherwise an error is printed and the process exits with status -1.
    The per-event ``"preweight"`` column of each training data frame is used
    as the sample weight, and only the observable columns are used as
    reweighting features.  The fitted reweighter is written with ``pickle``
    to the path returned by ``self.gbrwPath()``.

    Parameters
    ----------
    targetSample
        Sample providing the target distribution.  It must offer
        ``getTrainDataFrame()`` and an ``observables`` mapping.

    Raises
    ------
    SystemExit
        With code -1 when the observables of the two samples differ.
    """
    originDF = self.getTrainDataFrame()
    targetDF = targetSample.getTrainDataFrame()

    # Materialise the keys once: lists compare order-sensitively on both
    # Python 2 and 3 (column order matters for the fit) and, unlike a Python 3
    # keys view, are valid DataFrame column selectors.
    originObservables = list(self.observables.keys())
    targetObservables = list(targetSample.observables.keys())

    if originObservables != targetObservables:
        print('Error observables for target and origin data sets do not match. Exiting...')
        print('Origin: {}'.format(originObservables))
        print('Target: {}'.format(targetObservables))
        raise SystemExit(-1)

    # Grab the weights before the frames are reduced to the feature columns.
    originPreWeights = originDF["preweight"]
    targetPreWeights = targetDF["preweight"]

    originDF = originDF[originObservables]
    targetDF = targetDF[targetObservables]

    reweighter = GBReweighter(n_estimators=200,
                              learning_rate=.1,
                              max_depth=3,
                              min_samples_leaf=1000,
                              loss_regularization=1.0)
    reweighter.fit(original=originDF,
                   target=targetDF,
                   original_weight=originPreWeights,
                   target_weight=targetPreWeights)

    with open(self.gbrwPath(), "wb") as f:
        pickle.dump(reweighter, f)
def train_reweighter():
    """Fit a GBReweighter from the generated model to selected data and dump it.

    The features are the phase-space variables of the current mode plus its
    decay-time variable.  Every variable whose name contains ``'phi'`` is left
    out and replaced by ``cosphi`` / ``sinphi``, computed from ``phi1``.  Each
    feature is mapped linearly with its binning limits and shifted by 2 before
    the fit.  The data sample is restricted to the final selection combined
    with the delta-mass signal region.  250000 rows are drawn from each sample
    for training, and the fitted reweighter is handed to
    ``bdt_utils.dump_reweighter``.
    """
    extra_vars = [gcm().ltime_var]
    all_vars = gcm().phsp_vars + extra_vars

    columns = [v.var for v in all_vars if 'phi' not in v.var]
    columns.extend(['cosphi', 'sinphi'])

    # Data of the current mode, with phase-space variables appended.
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    data['cosphi'] = np.cos(data.phi1)
    data['sinphi'] = np.sin(data.phi1)

    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    # Generated model sample, given the same angular features.
    gen = get_model()
    gen['cosphi'] = np.cos(gen.phi1)
    gen['sinphi'] = np.sin(gen.phi1)

    # Lower/upper limit per feature, taken from each variable's binning.
    limits = {v.var: v.binning[1:] for v in all_vars}
    limits.update(cosphi=(-1., 1), sinphi=(-1., 1))

    # Apply the identical linear map to both samples.
    for col in columns:
        lo, hi = limits[col]
        for frame in (data, gen):
            frame[col] = (frame[col] - lo) / (hi - lo) + 2.

    log.info('Training BDT reweighter for {}'.format(', '.join(columns)))

    # Draw the model sample first, then the data sample, so the random
    # sampling happens in the same order as the fit arguments are listed.
    model_sample = gen[columns].sample(n=250000)
    data_sample = data[columns][df_sel].sample(n=250000)

    reweighter = GBReweighter(n_estimators=300, max_depth=5,
                              learning_rate=0.2)
    reweighter.fit(original=model_sample, target=data_sample)
    bdt_utils.dump_reweighter(reweighter)
def test_folding_gb_reweighter():
    """Run check_reweighter on a 3-fold FoldingReweighter wrapping a GBReweighter."""
    base_reweighter = GBReweighter(n_estimators=20, max_depth=2,
                                   learning_rate=0.1)
    reweighter = FoldingReweighter(base_reweighter, n_folds=3)
    check_reweighter(n_dimensions=2, n_samples=200000,
                     reweighter=reweighter, folding=True)
# Class balance of the training and validation subsets.
log.info(f" Background: {trainNBkg} ({trainNBkg / (trainNSig + trainNBkg) * 100:.2f}%)")
log.info(f" Shape of validation set: {np.shape(data_train[validMask])}")
log.info(f" Signal: {validNSig} ({validNSig / (validNSig + validNBkg) * 100:.2f}%)")
log.info(f" Background: {validNBkg} ({validNBkg / (validNSig + validNBkg) * 100:.2f}%)")

#============================================================================
# Reweigh
#============================================================================
log.info("Reweigh background data using GBReweighter on training set")
t = time()

# Three weight estimators that differ only in the number of trees.
# learning_rate, min_samples_leaf and loss_regularization are not passed,
# so the GBReweighter defaults apply to them.
reweighterEst10, reweighterEst20, reweighterEst40 = (
    GBReweighter(n_estimators=n_trees, max_depth=5)
    for n_trees in (10, 20, 40)
)

log.info("Fitting weights...")
def test_gb_reweighter_2d_new():
    """Run check_reweighter in 2D on a GBReweighter with subsample=0.3 in gb_args."""
    boosting_options = {'subsample': 0.3}
    reweighter = GBReweighter(
        max_depth=3,
        n_estimators=30,
        learning_rate=0.3,
        gb_args=boosting_options,
    )
    check_reweighter(n_dimensions=2, n_samples=200000, reweighter=reweighter)
def test_gb_reweighter_1d():
    """Run check_reweighter in 1D on a GBReweighter with 100 trees of depth 2."""
    check_reweighter(
        n_dimensions=1,
        n_samples=100000,
        reweighter=GBReweighter(n_estimators=100, max_depth=2),
    )
signal_reweight_data = reweight_data_small.where(reweight_data['Signal'] == 1) signal_reweight_data_s_dropped = signal_reweight_data.drop(['Signal'], axis=1) signal_reweight_data_nan_s_dropped = signal_reweight_data_s_dropped.dropna( axis=0) background_reweight_data = reweight_data_small.where( reweight_data['Signal'] == 0) background_reweight_data_s_dropped = background_reweight_data.drop(['Signal'], axis=1) background_reweight_data_nan_s_dropped = background_reweight_data_s_dropped.dropna( axis=0) ratio = len(signal_reweight_data_nan_s_dropped) / len( background_reweight_data_nan_s_dropped) reweighter = GBReweighter(n_estimators=40) reweighter.fit(background_reweight_data_nan_s_dropped, signal_reweight_data_nan_s_dropped) weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped) print(weights) total_weights = ratio * weights / np.mean(weights) #reweighted_background = background_reweight_data.multiply(weights, axis=0) fig_weight, ax_weight = plt.subplots(3, 2, figsize=(15, 15)) ax_weight[0, 0].hist(signal_reweight_data_nan_s_dropped.p_et_calo.ravel(), bins=50, range=(0, 100000), color='r',
verbose=1) #set_trace() pre_separation.fit(subtrain[reweight_feats], subtrain[['isE']], sample_weight=subtrain.weight) test_proba = pre_separation.predict_proba(subtest[reweight_feats])[:, 1] roc_pre = roc_curve(subtest[['isE']], test_proba, sample_weight=subtest.weight)[:2] auc_pre = roc_auc_score(subtest[['isE']], test_proba, sample_weight=subtest.weight) #run reweighting -- not working on MC for some reason reweighter = GBReweighter(n_estimators=1 if debug else 30, max_depth=4, learning_rate=0.1) reweighter.fit(subtrain[subtrain.isE == 1][reweight_feats], subtrain[ subtrain.isE == 0][reweight_feats]) #make electrons look like tracks #run weights FOR EVERYTHING! for df in [data, subtrain, subtest]: weights = reweighter.predict_weights( df[df.isE == 1][reweight_feats]) #1/w to be used df.loc[df.isE == 1, 'weight'] = weights #save reweighter joblib.dump(reweighter, reweight_model_file, compress=True) # Check that sepratation vanishes post_separation = GradientBoostingClassifier(
# Load the decay times from the LHCb simulation, which were serialised
# beforehand into mc_times.pickle.
# NOTE: unpickling executes code stored in the file -- only load a trusted one.
print("reading pickle")
with open("mc_times.pickle", "rb") as f:
    mc_times = pickle.load(f)

# Draw as many exponentially distributed times as there are simulated ones,
# using the decay constant below.
d_lifetime_ps = 0.49
N = len(mc_times)
print("gen times")
exp_times = np.random.exponential(d_lifetime_ps, N)

# Split both samples into a training part and a test part.
mc_train, mc_test, model_train, model_test = train_test_split(
    mc_times, exp_times)

# Train on the training halves, then get weights for the held-out model times.
bdt = GBReweighter()
print("Training bdt")
bdt.fit(original=model_train, target=mc_train)
weights = bdt.predict_weights(model_test)

# Histogram options shared by every curve.
kw = dict(bins=np.linspace(0.0, 9.0, 100), alpha=0.3, density=True)

plt.figure(figsize=(12.0, 9.0))
# NOTE(review): the fit above uses the model times as "original" and the MC
# times as "target", while the labels below name them the other way round --
# confirm which naming is intended.
for _values, _extra in (
        (mc_test, {"label": "Original"}),
        (model_test, {"label": "Target"}),
        (model_test, {"label": "Target Weighted", "weights": weights}),
):
    plt.hist(_values, **dict(kw, **_extra))

plt.legend()
plt.xlabel("Time /ps")
plt.ylabel("Counts")
plt.savefig("mwe.png")
target = read_root(options.Rootfiles[1], columns=all_branches_target) #Split original distribution if -t flag is given if options.TrainTest: print "Performing train-test split" original_train, original_test = train_test_split(original) else: original_train = original original_test = original original_weight_distribution_train = original_train[original_weights] if original_weights != None else None target_weight_distribution = target[target_weights] if target_weights != None else None #Start the training gb = GBReweighter(**gb_settings) print( "Doing training of GBReweighter..." ) print( "Re-weighting variables: {}".format(reweighting_branches) ) begin = time.time() gb_output = gb.fit(original_train[reweighting_branches], target[reweighting_branches], original_weight = original_weight_distribution_train, target_weight = target_weight_distribution) print( "Settings: {}".format(gb_output) ) print( "Training of GBReweighter took {:.2f} seconds".format(time.time()-begin) ) #Save the classifier as pickle if applicable if options.Save:
how='right').drop(['vspt', 'uid', 'Sample', 'Segment Knicks'], axis=1) # CREATING SAMPLE WEIGHTS # (https://arogozhnikov.github.io/hep_ml/reweight.html) res_cols = list(tm_revenue.reset_index().columns) resampling_b = datab[['Segment Knicks'] + res_cols] resampling_h = datah[['Segment Rangers'] + res_cols] sampleb = resampling_b.dropna(subset=['Segment Knicks']).drop( ['Segment Knicks'], axis=1).set_index('email') fullb = resampling_b[pd.isnull(resampling_b['Segment Knicks'])].drop( ['Segment Knicks'], axis=1).set_index('email') sampleh = resampling_h.dropna(subset=['Segment Rangers']).drop( ['Segment Rangers'], axis=1).set_index('email') fullh = resampling_h[pd.isnull(resampling_h['Segment Rangers'])].drop( ['Segment Rangers'], axis=1).set_index('email') reweighter = GBReweighter() sampleb['weight'] = reweighter.fit( original=sampleb, target=fullb).predict_weights(sampleb).round(3) sampleh['weight'] = reweighter.fit( original=sampleh, target=fullh).predict_weights(sampleh).round(3) # LOGIT MODELING # modeling_bball = pd.merge(data[data['vspt'] == 'basketball'], sampleb['weight'].reset_index(), on='email').drop( ['vspt', 'Sample', 'email', 'Segment Rangers'], axis=1).set_index('uid') modeling_hockey = pd.merge(data[data['vspt'] == 'hockey'], sampleh['weight'].reset_index(), on='email').drop( ['vspt', 'Sample', 'email', 'Segment Knicks'],
'loss_regularization': 5.0 }] log.info(f"Regular reweights") for iWeight, weightName in enumerate(reweightNames): t = time() # Print parameters log.info(f"Parameters for GBReweighter:") params = reweightParams[iWeight] for param in params: log.info(f" {param} : {params[param]}") # Setup reweighter: https://arogozhnikov.github.io/hep_ml/reweight.html# reweighter = GBReweighter( n_estimators=params['n_estimators'], learning_rate=params['learning_rate'], max_depth=params['max_depth'], min_samples_leaf=params['min_samples_leaf'], loss_regularization=params['loss_regularization']) # Create weight estimators and fit them to the data log.info(f"Fitting weights...") reweighter.fit( original=np.array([ data_train['eta'][trainMask & (data_train["label"] < 0.5)], data_train['pt'][trainMask & (data_train["label"] < 0.5)], data_train['invM'][trainMask & (data_train["label"] < 0.5)], data_train['correctedScaledActualMu'][ trainMask & (data_train["label"] < 0.5)] ]).T, target=np.array([ data_train['eta'][trainMask & (data_train["label"] >= 0.5)],