# Signal sample with every row that contains a NaN removed.
signal_reweight_data_nan_s_dropped = (
    signal_reweight_data_s_dropped.dropna())

# Background sample: rows whose 'Signal' flag is not 0 are blanked to NaN by
# `where`, the flag column is discarded, and rows containing NaNs are dropped.
# NOTE(review): the mask is built from `reweight_data` but applied to
# `reweight_data_small` -- confirm that the two frames share an index.
background_reweight_data = reweight_data_small.where(
    reweight_data['Signal'] == 0)
background_reweight_data_s_dropped = background_reweight_data.drop(
    columns=['Signal'])
background_reweight_data_nan_s_dropped = (
    background_reweight_data_s_dropped.dropna())

# Ratio of the cleaned signal sample size to the cleaned background one.
ratio = (len(signal_reweight_data_nan_s_dropped)
         / len(background_reweight_data_nan_s_dropped))

# Fit the reweighter with the background sample as `original` and the signal
# sample as `target`, then evaluate weights on that same background sample.
reweighter = GBReweighter(n_estimators=40)
reweighter.fit(original=background_reweight_data_nan_s_dropped,
               target=signal_reweight_data_nan_s_dropped)
weights = reweighter.predict_weights(background_reweight_data_nan_s_dropped)
print(weights)

# Rescale the predicted weights so that their mean equals `ratio`.
total_weights = ratio * weights / np.mean(weights)

#reweighted_background = background_reweight_data.multiply(weights, axis=0)

# 3x2 grid of axes for the comparison histograms.
fig_weight, ax_weight = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

# Top-left panel: p_et_calo distribution of the signal sample.
ax_weight[0, 0].hist(
    signal_reweight_data_nan_s_dropped['p_et_calo'].ravel(),
    bins=50, range=(0, 100000), color='r', alpha=0.5, label="Signal")
ax_weight[0, 0].hist(background_reweight_data_nan_s_dropped.p_et_calo.ravel(),
Exemplo n.º 2
0
# Read the decay times from the LHCb simulation - I've serialised it here
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# use it on a trusted, locally produced "mc_times.pickle".
print("reading pickle")
with open("mc_times.pickle", "rb") as f:
    mc_times = pickle.load(f)

# Generate as many random decay times as there are simulated ones, drawn from
# an exponential distribution whose scale (mean) is d_lifetime_ps.
d_lifetime_ps = 0.49
N = len(mc_times)
print("gen times")
exp_times = np.random.exponential(d_lifetime_ps, N)

# Split both samples into train and test parts in a single call.
mc_train, mc_test, model_train, model_test = train_test_split(
    mc_times, exp_times)

# Train the reweighter to make the generated exponential (original) look like
# the simulation (target), then predict weights for the held-out original.
bdt = GBReweighter()
print("Training bdt")
bdt.fit(original=model_train, target=mc_train)
weights = bdt.predict_weights(model_test)

kw = {"bins": np.linspace(0.0, 9.0, 100), "alpha": 0.3, "density": True}
plt.figure(figsize=(12.0, 9.0))

# Labels follow the roles used in bdt.fit above: the simulation is the target
# and the generated exponential is the original.  The call order is kept so
# each histogram keeps the colour it had before.
plt.hist(mc_test, label="Target", **kw)
plt.hist(model_test, label="Original", **kw)
plt.hist(model_test, label="Original Weighted", weights=weights, **kw)
plt.legend()

plt.xlabel("Time /ps")
# density=True normalises each histogram to unit area, so the y axis is a
# probability density rather than raw counts.
plt.ylabel("Probability density")
plt.savefig("mwe.png")
Exemplo n.º 3
0
                        test_proba,
                        sample_weight=subtest.weight)[:2]
    # Weighted ROC AUC of the current classifier scores (test_proba) on
    # subtest, taken before the reweighter below is trained.
    auc_pre = roc_auc_score(subtest[['isE']],
                            test_proba,
                            sample_weight=subtest.weight)

    #run reweighting -- not working on MC for some reason
    # A single estimator is used in debug mode, 30 otherwise.
    reweighter = GBReweighter(n_estimators=1 if debug else 30,
                              max_depth=4,
                              learning_rate=0.1)
    # First argument: reweight_feats of the subtrain rows with isE == 1;
    # second argument: reweight_feats of the subtrain rows with isE == 0.
    # NOTE(review): no sample weights are passed to fit, although a 'weight'
    # column is used elsewhere in this span -- confirm that is intended.
    reweighter.fit(subtrain[subtrain.isE == 1][reweight_feats], subtrain[
        subtrain.isE == 0][reweight_feats])  #make electrons look like tracks

    #run weights FOR EVERYTHING!
    # For each frame, the 'weight' column of the isE == 1 rows is overwritten
    # with the predicted weights; rows with isE == 0 are left untouched.
    # NOTE(review): if any of these frames is a slice of another DataFrame,
    # this in-place assignment may not propagate as expected -- verify.
    for df in [data, subtrain, subtest]:
        weights = reweighter.predict_weights(
            df[df.isE == 1][reweight_feats])  #1/w to be used
        df.loc[df.isE == 1, 'weight'] = weights

    #save reweighter
    # Persists the fitted reweighter to reweight_model_file with compression.
    joblib.dump(reweighter, reweight_model_file, compress=True)

    # Check that separation vanishes
    # A fresh classifier is trained on subtrain using the updated 'weight'
    # column (one estimator in debug mode, 50 otherwise; fixed random_state).
    post_separation = GradientBoostingClassifier(
        n_estimators=1 if debug else 50,
        max_depth=4,
        random_state=42,
        verbose=1)
    # NOTE(review): subtrain[['isE']] is a one-column DataFrame rather than a
    # 1-D Series -- confirm the classifier accepts it without a warning.
    post_separation.fit(subtrain[reweight_feats],
                        subtrain[['isE']],
                        sample_weight=subtrain.weight)
    # test_proba is replaced by the second column of predict_proba on subtest.
    test_proba = post_separation.predict_proba(subtest[reweight_feats])[:, 1]
Exemplo n.º 4
0
                pickle.dump(pickle_file, handle)
            print( "GBReweighter saved to {}".format(options.Save) )





        #Plotting if applicable
        # NOTE: the comparison with False is kept as-is so that any value
        # other than False (including None) still enables this branch.
        if options.Plots != False:
            #Prediction of the training data for comparison
            print( "Predicting with the just trained model" )
            #Use the configured weight branch when one is given, otherwise unit weights
            original_weight_distribution_test = original_test[original_weights] if original_weights is not None else np.ones(len(original_test))
            original_weight_distribution_train = original_train[original_weights] if original_weights is not None else np.ones(len(original_train))
            target_weight_distribution = target[target_weights] if target_weights is not None else np.ones(len(target))

            #Weights predicted by the trained reweighter for the test and training parts of the original sample
            calculated_weights_test = gb.predict_weights(original_test[reweighting_branches], original_weight_distribution_test)
            calculated_weights_train = gb.predict_weights(original_train[reweighting_branches], original_weight_distribution_train)

            #Some output about the calculated weights
            #normalise to number of original entries
            n_entries_test = len(original_test)
            calculated_weights_scaled_test = calculated_weights_test * n_entries_test / np.sum(calculated_weights_test)
            print( "Entries in original (test-)dataset: {}".format(n_entries_test) )
            #.format must be applied to the string, not to the return value of print() (None in Python 3)
            print( "Sum of calculated weights: {}".format(np.sum(calculated_weights_scaled_test)) )
            max_weight = np.max(calculated_weights_scaled_test)
            print( "Maximum weight: {} ({:%} of entries)".format(max_weight, max_weight/n_entries_test) )

            #Normalisation for training as well
            calculated_weights_scaled_train = calculated_weights_train * len(original_train) / np.sum(calculated_weights_train)