Exemplo n.º 1
0
def create_test_data(X,
                     y,
                     directory,
                     n_dump_samples=100,
                     objective="binary:logitraw"):
    """Train an XGBoost classifier on (X, y) and dump reference artifacts.

    Writes into *directory*:
      - model.txt / model.bin : text dump and binary snapshot of the booster
      - model.xml             : TMVA-style XML produced by xgboost2tmva
      - X.csv / preds.csv     : the first *n_dump_samples* rows of X and the
                                corresponding predicted probabilities

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features (assumed to expose ``.shape`` -- numpy array or
        DataFrame; TODO confirm against callers).
    y : array-like of shape (n_samples,)
        Training labels.
    directory : str
        Output directory; created if it does not exist.
    n_dump_samples : int
        Number of leading samples dumped for regression testing.
    objective : str
        XGBoost objective passed to the classifier.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    model = XGBClassifier(n_estimators=100, max_depth=7,
                          objective=objective).fit(X, y)

    model._Booster.dump_model(os.path.join(directory, "model.txt"))
    model._Booster.save_model(os.path.join(directory, "model.bin"))

    # BUGFIX: the TMVA feature map needs one entry per *feature*; the
    # original iterated range(len(y)), i.e. the number of *samples*.
    n_features = X.shape[1]
    feature_names = [("f" + str(i), "F") for i in range(n_features)]
    xgboost2tmva.convert_model(model._Booster.get_dump(), feature_names,
                               os.path.join(directory, "model.xml"))

    X_dump = X[:n_dump_samples]

    preds_dump = model.predict_proba(X_dump)

    # For binary classification keep only the positive-class column.
    if preds_dump.shape[1] == 2:
        preds_dump = preds_dump[:, 1]

    pd.DataFrame(X_dump).to_csv(os.path.join(directory, "X.csv"), **csv_args)
    pd.DataFrame(preds_dump).to_csv(os.path.join(directory, "preds.csv"),
                                    **csv_args)
"""
cls = xgb.XGBClassifier()
cls.fit(
    traindataset[trainVars()].values,
    traindataset.target.astype(np.bool),
    #sample_weight= (traindataset[weights].astype(np.float64)),
    #eval_set=[(traindataset[trainVars()].values,  traindataset.target.astype(np.bool),traindataset[weights].astype(np.float64)),
    #(valdataset[trainVars()].values,  valdataset.target.astype(np.bool), valdataset[weights].astype(np.float64))] ,
    #eval_metric='logloss'
)
#print (cls.evals_result())
#print (cls.evals_result()['validation_0']['logloss'])
#"""
model = cls.booster().get_dump(
    fmap='', with_stats=False)  #.get_dump() #pickle.dumps(cls)
xgboost2tmva.convert_model(model, trainVars(),
                           "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml")
# xmllint --format TMVABDT_2lss_1tau_XGB_wMEMallVars.xml
#skTMVA.convert_bdt_sklearn_tmva(cls, trainVars(), "TMVABDT_2lss_1tau_XGB_wMEMallVars.xml")
#sklearn_to_tmva.xgbr_to_tmva(cls,evals_result,data[trainVars()],trainVars(),"TMVABDT_2lss_1tau_XGB_wMEMallVars.xml",coef=2)
# run cross validation
print("XGBoost trained")
proba = cls.predict_proba(traindataset[trainVars()].values)
fpr, tpr, thresholds = roc_curve(traindataset["target"], proba[:, 1])
train_auc = auc(fpr, tpr, reorder=True)
print("XGBoost train set auc - {}".format(train_auc))
proba = cls.predict_proba(valdataset[trainVars()].values)
fprt, tprt, thresholds = roc_curve(valdataset["target"], proba[:, 1])
test_auct = auc(fprt, tprt, reorder=True)
print("XGBoost test set auc - {}".format(test_auct))
""" 
sklearn_to_tmva.gbr_to_tmva(
Exemplo n.º 3
0
    plt.xlim([-1, len(features)])
    plt.savefig('vriable_importance_15032019_nTree260_endcap.png')


variable_importance(model, input_vars)

##################################################################################################################################

# convert xgboost to TMVA weights

# NOTE(review): this fragment uses Python 2 statement syntax
# (`print >> file`, `print "..."`) and will not run under Python 3 as written.
import tempfile
# Write an xgboost feature-map file: one "<index> <name> q" line per input
# variable ("q" marks a quantitative feature in xgboost's fmap format).
feature_map = tempfile.NamedTemporaryFile(suffix=".txt")
for index, varname in enumerate(input_vars):
    print >> feature_map, index, varname, "q"

# Flush so get_dump(fmap=...) below sees the complete file on disk.
feature_map.flush()

import re

# Derive the XML output name from the model's pickle filename (.pkl -> .xml).
tmva_output_fname = re.sub("\\.pkl$", ".xml", model_fname)

# Dump the trained booster using the feature map, then convert the text dump
# into a TMVA-style XML weights file (all variables declared as float 'F').
model_dump = model.get_booster().get_dump(fmap=feature_map.name)
xgboost2tmva.convert_model(model_dump,
                           input_variables=[(input_var, 'F')
                                            for input_var in input_vars],
                           output_xml=tmva_output_fname,
                           pretty=True)

print "Wrote", tmva_output_fname
###############################################################################################################################
Exemplo n.º 4
0
# Deep tree ensembles can exceed the default recursion limit when dumped or
# pickled, so raise it generously up front.
sys.setrecursionlimit(1000000)

# Single source of truth for the feature count (was hard-coded as the magic
# number 5 in three separate places below).
n_features = 5

X, y = make_classification(n_samples=10000,
                           n_features=n_features,
                           random_state=42,
                           n_classes=2,
                           weights=[0.5])

model = XGBClassifier(n_estimators=1000).fit(X, y)

# Text dump and binary snapshot of the trained booster.
model._Booster.dump_model("model.txt")
model._Booster.save_model("model.bin")

# export to TMVA-style XML file
input_variables = [("f" + str(i), "F") for i in range(n_features)]
xgboost2tmva.convert_model(model._Booster.get_dump(), input_variables,
                           "model.xml")

# export to hardcoded C
code = m2c.export_to_c(model)
with open("model.c", "w") as c_file:
    c_file.write(code)

# Benchmark batch prediction on uniformly random inputs.
X_test = np.random.uniform(-5, 5, size=(100000, n_features))

start_time = time.time()

preds = model.predict_proba(X_test)[:, 1]
print(np.mean(preds))

elapsed_secs = time.time() - start_time
# BUGFIX: the timing result was computed but never reported.
print("predict_proba on {} samples took {:.3f} s".format(
    len(X_test), elapsed_secs))
Exemplo n.º 5
0
# NOTE(review): fragment starts mid-script -- `proba`, `cls`, `options`,
# `channel`, etc. are defined earlier in the original file. It also mixes
# Python 2 (`print xmlfile + " written"`) and Python 3 print calls.
# ROC/AUC on the training set using the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(traindataset["target"], proba[:, 1])
train_auc = auc(fpr, tpr, reorder=True)
print("XGBoost train set auc - {}".format(train_auc))
# Same evaluation on the held-out validation set.
proba = cls.predict_proba(valdataset[trainVars(False)].values)
fprt, tprt, thresholds = roc_curve(valdataset["target"], proba[:, 1])
test_auct = auc(fprt, tprt, reorder=True)
print("XGBoost test set auc - {}".format(test_auct))
if options.doXML == True:
    print("Date: ", time.asctime(time.localtime(time.time())))
    # Persist the trained classifier as a pickle next to the XML weights.
    pklpath = channel + "/" + channel + "_XGB_" + trainvar + "_" + bdtType + ".pkl"
    pickle.dump(cls, open(pklpath, 'wb'))
    # save the model in file 'xgb.model.dump'
    model = cls.booster().get_dump(
        fmap='', with_stats=False)  #.get_dump() #pickle.dumps(cls)
    # Convert the booster dump to a TMVA-style XML weights file.
    xmlfile = channel + "/" + channel + "_XGB_" + trainvar + "_" + bdtType + ".xml"
    xgboost2tmva.convert_model(model, trainVars(False), xmlfile)
    print xmlfile + " written"
    print("Date: ", time.asctime(time.localtime(time.time())))
    """
	model2 = cls.booster().get_score(fmap='', importance_type='weight')
	#print json.dump(model2, ensure_ascii=False, sort_keys=True, indent=4, default=lambda x: None)
	with open(pklpath, 'rb') as fpkl, open('%s.json' % pklpath, 'w') as fjson:
		pkldata = pickle.load(fpkl)
		#model.save_model('0001.model')
		json.dump(pkldata, fjson, ensure_ascii=False, sort_keys=True, indent=4, default=lambda x: None)
	"""
    #print json.dumps(model, sort_keys=True)
    #parse in command line: xmllint --format TMVABDT_2lss_1tau_XGB_wMEMallVars.xml

##################################################
"""
Exemplo n.º 6
0
        # NOTE(review): interior of an enclosing loop/function cut off above
        # this view; df, cfg, idname, training_bin, feature_cols, out_dir
        # are defined there.
        # Restrict to the base selection plus this training bin's cut.
        df = df.query(cfg["selection_base"])
        df = df.query(cfg["trainings"][idname][training_bin]["cut"])
        # Label column from the formula y = bkg + 2*sig - 1:
        # bkg-only -> 0, sig-only -> 1 (neither -> -1, both -> 2).
        df.eval("y = ({0}) + 2 * ({1}) - 1".format(cfg["selection_bkg"],
                                                   cfg["selection_sig"]),
                inplace=True)

        print("Running bayesian optimized training...")
        xgb_bo_trainer = XgbBoTrainer(data=df, X_cols=feature_cols, y_col="y")
        xgb_bo_trainer.run()

        print("Saving weight files...")
        tmvafile = join(out_dir, "weights.xml")
        # Convert the bayes-optimized ("bo") model's booster dump to TMVA XML;
        # every feature is declared as a float ("F").
        xgboost2tmva.convert_model(
            xgb_bo_trainer.models["bo"]._Booster.get_dump(),
            input_variables=list(zip(feature_cols,
                                     len(feature_cols) * ["F"])),
            output_xml=tmvafile,
        )
        # Pretty-print the XML in place (keeping a .bak copy), then gzip it.
        os.system("xmllint --format {0} > {0}.tmp".format(tmvafile))
        os.system("mv {0} {0}.bak".format(tmvafile))
        os.system("mv {0}.tmp {0}".format(tmvafile))
        os.system("cd " + out_dir + " && gzip -f weights.xml")

        print("Saving bayesian optimization results...")
        xgb_bo_trainer.get_results_df().to_csv(
            join(out_dir, "xgb_bo_results.csv"))

        print("Saving individual cv results...")
        if not os.path.exists(join(out_dir, "cv_results")):
            os.makedirs(join(out_dir, "cv_results"))
        for i, cvr in enumerate(xgb_bo_trainer.cv_results):