def test_xgboost_random_states():
    """Check how the ``random_state`` argument affects reproducibility.

    For every candidate value two classifiers are created with that same
    ``random_state`` and trained on the same data.  If the value is a
    ``numpy.random.RandomState`` instance the two sets of predicted
    probabilities must differ; for any other value (here an integer or
    ``None``) they must agree.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)
    candidates = [145, None, check_random_state(None), check_random_state(145)]

    for random_state in candidates:
        first = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1,
                                  random_state=random_state)
        first.fit(X, y)
        second = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1,
                                   random_state=random_state)
        second.fit(X, y)

        # NOTE(review): presumably a shared RandomState object is advanced by
        # each classifier, which is why its two fits should diverge - confirm
        # against the XGBoostClassifier implementation.
        expect_match = not isinstance(random_state, numpy.random.RandomState)
        predictions_match = numpy.allclose(first.predict_proba(X),
                                           second.predict_proba(X))
        assert predictions_match == expect_match, 'seed: {}'.format(random_state)
def test_xgboost_random_states():
    """Verify the reproducibility contract of ``random_state``.

    A pair of classifiers is trained for each tested ``random_state`` value.
    The pair must produce different probabilities when the value is a
    ``numpy.random.RandomState`` object, and matching probabilities otherwise.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)

    def train(seed):
        # Build and fit one small classifier using the given random_state.
        model = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1,
                                  random_state=seed)
        model.fit(X, y)
        return model

    for random_state in (145, None, check_random_state(None), check_random_state(145)):
        clf_a = train(random_state)
        clf_b = train(random_state)
        proba_a = clf_a.predict_proba(X)
        proba_b = clf_b.predict_proba(X)

        if isinstance(random_state, numpy.random.RandomState):
            # The very same RandomState object was handed to both classifiers.
            assert not numpy.allclose(proba_a, proba_b), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(proba_a, proba_b), 'seed: {}'.format(random_state)
def test_xgboost_feature_importance():
    """Check that importances are reported for exactly the training columns.

    The index of ``get_feature_importances()`` must contain the same names as
    the columns of the training data, and ``feature_importances_`` must hold
    one entry per column.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)

    importance_table = clf.get_feature_importances()
    expected_names = set(X.columns)
    reported_names = set(importance_table.index)
    # Printed so that a failing run shows both sets of names.
    print(expected_names, reported_names)

    assert expected_names == reported_names, 'feature_importances_ return something wrong'
    assert len(expected_names) == len(clf.feature_importances_)
def test_feature_importances():
    """Compare the three ways of reading feature importances.

    The f-score mapping of the underlying ``xgboost_classifier`` must equal
    the one returned by ``_get_fscore()``, and every strictly positive entry
    of ``feature_importances_`` must equal the f-score stored under the key
    ``'f<position>'``.
    """
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)

    booster_scores = clf.xgboost_classifier.get_fscore()
    wrapper_scores = clf._get_fscore()
    importances = clf.feature_importances_

    assert booster_scores == wrapper_scores, booster_scores

    for position, importance in enumerate(importances):
        if not importance > 0.0:
            # Only strictly positive importances are looked up in the mapping.
            continue
        assert importance == booster_scores['f' + str(position)]
def test_xgboost_works_with_different_dtypes():
    """Smoke-test fitting and predicting with several numeric dtypes.

    First the features, labels and weights are all cast to one dtype at a
    time.  Then a single ``pandas.DataFrame`` whose columns each have a
    different dtype is used.  The test only requires that ``fit`` and
    ``predict_proba`` run; the predictions themselves are not inspected.
    """
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']

    # Part 1: everything cast to the same dtype.
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        features_cast = X.astype(dtype=dtype)
        labels_cast = y.astype(dtype=dtype)
        weights_cast = weights.astype(dtype)
        clf.fit(features_cast, labels_cast, sample_weight=weights_cast)
        clf.predict_proba(X.astype(dtype))

    # Part 2: one DataFrame with a differently typed column per dtype.
    # Only the labels and weights of the generated data are used here.
    _, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    mixed = pandas.DataFrame()
    n_rows = len(y)
    for column_dtype in dtypes:
        column = numpy.random.normal(0, 10, size=n_rows)
        mixed[column_dtype] = column.astype(column_dtype)

    mixed_clf = XGBoostClassifier(n_estimators=10)
    mixed_clf.fit(mixed, y, sample_weight=weights)
    mixed_clf.predict_proba(mixed)
# Choose the feature list and the output tag for this pass.
# NOTE(review): this fragment reads `ii`; for any value other than 1 or 2 it
# relies on `train` and `Var` having been assigned before this point - confirm
# against the enclosing code.
if ii == 1:
    train = trainFeaturesObvious
    Var = 'Mass'
elif ii == 2:
    train = trainFeaturesHH
    Var = 'HH'

xgb = XGBoostClassifier(train)  # , original = xgboriginal.XGBClassifier(train)
# Hyper-parameter note kept from the original (it was an unused string literal,
# never passed to the classifier):
#     n_estimators = 200, eta = 0.1, max_depth = 7, subsample = 0.9, colsample = 0.6)

# The builtin `bool` is used instead of the `np.bool` alias: the alias was
# deprecated in NumPy 1.20 and removed in 1.24, while on older releases it was
# the very same object as `bool`, so the cast is unchanged.
xgb.fit(
    traindatasetmix[train].astype(np.float64),
    traindatasetmix.target.astype(bool),
    sample_weight=traindatasetmix[weights].astype(np.float64),
)
prob = xgb.predict_proba(valdatasetmix[train].astype(np.float64))

# Build the report for the feature set that belongs to this pass, evaluated
# on the training sample.
if ii == 0:
    reportAll = xgb.test_on(
        traindatasetmix[trainFeaturesplot].astype(np.float64),
        traindatasetmix.target.astype(bool),
    )
elif ii == 1:
    reportObvious = xgb.test_on(
        traindatasetmix[trainFeaturesObvious].astype(np.float64),
        traindatasetmix.target.astype(bool),
    )
elif ii == 2:
    reportHH = xgb.test_on(
        traindatasetmix[trainFeaturesHH].astype(np.float64),
        traindatasetmix.target.astype(bool),
    )

# compatible with lustr/lxplus
#features = ['costhst_DiJets[0]_HH', 'costhst_Jets[0]_DiJets[0]', 'costhst_Jets[2]_DiJets[1]', 'CSV3', 'CSV4', 'Jets[0].eta()', 'Jets[1].eta()', 'Jets[2].eta()', 'Jets[3].eta()', 'HT_other_jets']
#dataout = traindatasetmix.rename(index=str, columns={'HHCost':'costhst_DiJets[0]_HH', 'H1Costbb':'costhst_Jets[0]_DiJets[0]', 'H2Costbb':'costhst_Jets[2]_DiJets[1]', 'CSV3':'CSV3', 'CSV4':'CSV4', 'jeteta1':'Jets[0].eta()', 'jeteta2':'Jets[1].eta()', 'jeteta3':'Jets[2].eta()', 'jeteta4':'Jets[3].eta()', 'jetHTrest':'HT_other_jets'})
#param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
#num_round = 2
#original = xgboriginal.XGBClassifier(param, train, num_round).fit(traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(np.bool), sample_weight= (traindatasetmix[weights].astype(np.float64)))
#proboriginal = original.predict_proba(valdatasetmix[train].astype(np.float64))
#print proboriginal
#joblib.dump(original, outputCentral+"_"+Var+'.pkl')

# Store the probabilities predicted for `valdatasetmix` in a file named after
# the output prefix and the tag of this pass.
joblib.dump(prob, outputCentral+"_"+Var+'.pkl')
#pickle.dump(prob, outputCentral+"_"+Var+'.pkl')