def get_native_parameters_test():
    """Train an XGBoost model on a sparse, all-categorical frame and verify
    that the native-parameter table is exposed under the expected header."""
    assert H2OXGBoostEstimator.available() is True

    tree_count = 17
    # CPU Backend is forced for the results to be comparable
    params = {
        "ntrees": tree_count,
        "max_depth": 4,
        "seed": 1,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": tree_count + 1,
        "dmatrix_type": "sparse",
        "tree_method": "exact",
        "backend": "cpu",
    }

    n_rows, n_cols, factor_levels, numeric_cols = 1000, 10, 11, 0
    enum_count = n_cols - numeric_cols
    frame = pyunit_utils.genTrainFrame(n_rows, 0, enumCols=enum_count,
                                       enumFactors=factor_levels,
                                       miscfrac=0.1, randseed=17)
    print(frame)

    response = 'response'
    predictors = frame.names
    predictors.remove(response)

    model = H2OXGBoostEstimator(**params)
    model.train(x=predictors, y=response, training_frame=frame)

    native_table = model._model_json["output"]["native_parameters"]
    print(native_table.as_data_frame())
    assert native_table._table_header == u"Native XGBoost Parameters"
def interaction_constraint_test():
    """Verify that column-name interaction constraints are translated into
    the zero-based column-index groups passed to native XGBoost."""
    assert H2OXGBoostEstimator.available() is True

    # CPU Backend is forced for the results to be comparable
    params = {
        "tree_method": "exact",
        "seed": 123,
        "backend": "cpu",
        "ntrees": 5,
        # two disjoint interaction groups given by column name
        "interaction_constraints": [["C1", "C2"], ["C3", "C4", "C5"]],
    }

    frame = pyunit_utils.genTrainFrame(100, 10, enumCols=0, randseed=17)
    print(frame)

    response = 'response'
    predictors = frame.names
    predictors.remove(response)

    model = H2OXGBoostEstimator(**params)
    model.train(x=predictors, y=response, training_frame=frame)

    native_params = model._model_json["output"]["native_parameters"].as_data_frame()
    print(native_params)

    row_mask = native_params['name'] == "interaction_constraints"
    constraints = native_params[row_mask]['value'].values[0]
    assert constraints == u'[[0,1],[2,3,4]]', \
        "Constraints should be [[0,1],[2,3,4]] but it is:" + constraints
def get_native_parameters_test():
    """Verify monotone constraints given by column name are expanded into the
    per-column (-1/0/1) tuple passed to native XGBoost."""
    assert H2OXGBoostEstimator.available() is True

    # CPU Backend is forced for the results to be comparable
    base_params = {"tree_method": "exact", "seed": 123,
                   "backend": "cpu", "ntrees": 5}

    frame = pyunit_utils.genTrainFrame(100, 10, enumCols=0, randseed=17)
    print(frame)

    response = 'response'
    features = frame.names
    features.remove(response)

    # decreasing on C1, increasing on C3 and C7; all other columns unconstrained
    base_params["monotone_constraints"] = {"C1": -1, "C3": 1, "C7": 1}

    model = H2OXGBoostEstimator(**base_params)
    model.train(x=features, y=response, training_frame=frame)

    native_params = model._model_json["output"]["native_parameters"].as_data_frame()
    print(native_params)

    constraint_row = native_params[native_params['name'] == "monotone_constraints"]
    assert constraint_row['value'].values[0] == u'(-1,0,1,0,0,0,1,0,0,0)'
def get_native_parameters_test():
    """Check that H2O's monotone_constraints dict is rendered as the native
    XGBoost per-column constraint vector."""
    assert H2OXGBoostEstimator.available() is True

    # CPU Backend is forced for the results to be comparable
    h2o_params = dict(tree_method="exact", seed=123, backend="cpu", ntrees=5)

    train_frame = pyunit_utils.genTrainFrame(100, 10, enumCols=0, randseed=17)
    print(train_frame)

    target = 'response'
    x_cols = train_frame.names
    x_cols.remove(target)

    h2o_params["monotone_constraints"] = {"C1": -1, "C3": 1, "C7": 1}
    estimator = H2OXGBoostEstimator(**h2o_params)
    estimator.train(x=x_cols, y=target, training_frame=train_frame)

    table = estimator._model_json["output"]["native_parameters"].as_data_frame()
    print(table)

    # the dict form is flattened into one value per predictor column, in order
    value = table[table['name'] == "monotone_constraints"]['value'].values[0]
    assert value == u'(-1,0,1,0,0,0,1,0,0,0)'
def comparison_test_dense():
    """Compare H2O XGBoost predictions against native XGBoost on a mixed
    numeric/categorical binomial frame using a sparse DMatrix.

    NOTE(review): despite the function name, dmatrix_type is "sparse" and the
    conversion uses convertH2OFrameToDMatrixSparse — confirm intended.
    """
    assert H2OXGBoostEstimator.available() is True
    runSeed = 1
    testTol = 1e-6          # tolerance for prediction comparison
    ntrees = 17
    maxdepth = 5
    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                  "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                  "min_rows": 5, "score_tree_interval": ntrees + 1,
                  "dmatrix_type": "sparse", "tree_method": "exact",
                  "backend": "cpu"}
    # native parameters mirroring the H2O settings above (lambda/alpha zeroed
    # so regularization matches)
    nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                   'tree_method': 'exact', 'seed': h2oParamsD["seed"],
                   'booster': 'gbtree', 'objective': 'binary:logistic',
                   'lambda': 0.0, 'eta': h2oParamsD["learn_rate"],
                   'grow_policy': 'depthwise', 'alpha': 0.0, 'subsample': 1.0,
                   'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                   'min_child_weight': h2oParamsD["min_rows"], 'gamma': 0.0,
                   'max_depth': h2oParamsD["max_depth"]}
    nrows = 10000
    ncols = 10
    factorL = 11            # number of factor levels per enum column
    numCols = 5
    enumCols = ncols - numCols
    responseL = 2           # binomial response
    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                           enumFactors=factorL, miscfrac=0.5,
                                           responseLevel=responseL)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    enumCols = myX[0:enumCols]   # rebind to the list of enum column names
    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1
    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                              enumCols=enumCols)
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                            num_boost_round=ntrees)
    # NOTE(review): time1 was last set before the H2O predict, so this native
    # training time also includes the H2O scoring and DMatrix conversion
    # intervals — confirm whether that inflation is intended.
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1
    pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred,
                                          h2oTrainTimeD, nativeTrainTime,
                                          h2oPredictTimeD, nativeScoreTime,
                                          tolerance=testTol)
def comparison_test():
    """Compare H2O XGBoost multinomial predictions against a native XGBoost
    model built from parameters/DMatrix derived off the H2O model itself.

    Skipped on Python 2 (native xgboost requires py3) and on multinode
    clusters (results would not be reproducible).
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        maxdepth = 5
        nrows = 10000
        ncols = 12
        factorL = 20        # factor levels per enum column
        numCols = 1
        enumCols = ncols - numCols
        responseL = 4       # multinomial: 4 response classes
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "tree_method": "exact", "backend": "cpu"}
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL,
                                               responseLevel=responseL,
                                               miscfrac=0.01, randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        newNames = []
        for ind in range(0, len(myX)):
            myX[ind] = myX[ind] + "_" + str(ind)  # avoid duplicated column names
            newNames.append(myX[ind])
        newNames.append(y)
        trainFile.set_names(newNames)
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oPredictD = h2oModelD.predict(trainFile)
        # derive native XGBoost parameter and DMatrx from h2oXGBoost model and H2OFrame
        nativeXGBoostParam = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
        nativeXGBoostInput = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)
        # nativeXGBoostParam is (param_dict, num_boost_round)
        nativeModel = xgb.train(params=nativeXGBoostParam[0],
                                dtrain=nativeXGBoostInput,
                                num_boost_round=nativeXGBoostParam[1])
        nativePred = nativeModel.predict(data=nativeXGBoostInput,
                                         ntree_limit=nativeXGBoostParam[1])
        # timings are not compared here (passed as -1 placeholders)
        pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred,
                                                 -1, -1, -1, -1, tolerance=1e-6)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def scale_pos_weight_test():
    """Verify the scale_pos_weight H2O parameter is forwarded verbatim to the
    native XGBoost parameter table."""
    assert H2OXGBoostEstimator.available() is True

    frame = pyunit_utils.genTrainFrame(1000, 0, enumCols=10, enumFactors=2,
                                       miscfrac=0.1, randseed=17)

    model = H2OXGBoostEstimator(ntrees=1, seed=1, scale_pos_weight=1.2)
    model.train(y='response', training_frame=frame)

    table = model._model_json["output"]["native_parameters"].as_data_frame()
    weight_values = table[table['name'] == 'scale_pos_weight']["value"]
    assert min(weight_values) == 1.2
def comparison_test():
    """Compare H2O XGBoost multinomial predictions against a native XGBoost
    model whose parameters and DMatrix are derived from the H2O model.

    Mostly-numeric variant (11 numeric + 1 enum column). Skipped on multinode
    clusters since results would not be reproducible there.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        maxdepth = 5
        nrows = 10000
        ncols = 12
        factorL = 20        # factor levels per enum column
        numCols = 11
        enumCols = ncols - numCols
        responseL = 4       # multinomial: 4 response classes
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "tree_method": "exact", "backend": "cpu"}
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL,
                                               responseLevel=responseL,
                                               miscfrac=0.01, randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        newNames = []
        for ind in range(0, len(myX)):
            myX[ind] = myX[ind] + "_" + str(ind)  # avoid duplicated column names
            newNames.append(myX[ind])
        newNames.append(y)
        trainFile.set_names(newNames)
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oPredictD = h2oModelD.predict(trainFile)
        # derive native XGBoost parameter and DMatrx from h2oXGBoost model and H2OFrame
        nativeXGBoostParam = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
        nativeXGBoostInput = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)
        # nativeXGBoostParam is (param_dict, num_boost_round)
        nativeModel = xgb.train(params=nativeXGBoostParam[0],
                                dtrain=nativeXGBoostInput,
                                num_boost_round=nativeXGBoostParam[1])
        nativePred = nativeModel.predict(data=nativeXGBoostInput,
                                         ntree_limit=nativeXGBoostParam[1])
        # timings are not compared here (passed as -1 placeholders)
        pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred,
                                                 -1, -1, -1, -1, tolerance=1e-6)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def get_native_parameters_test():
    """Fit a sparse CPU XGBoost model on categorical data and assert that the
    native-parameter table carries its standard header."""
    assert H2OXGBoostEstimator.available() is True

    num_trees = 17
    # CPU Backend is forced for the results to be comparable
    estimator_args = dict(
        ntrees=num_trees,
        max_depth=4,
        seed=1,
        learn_rate=0.7,
        col_sample_rate_per_tree=0.9,
        min_rows=5,
        score_tree_interval=num_trees + 1,
        dmatrix_type="sparse",
        tree_method="exact",
        backend="cpu",
    )

    total_cols = 10
    numeric_count = 0
    train_frame = pyunit_utils.genTrainFrame(1000, 0,
                                             enumCols=total_cols - numeric_count,
                                             enumFactors=11,
                                             miscfrac=0.1, randseed=17)
    print(train_frame)

    target = 'response'
    feature_names = train_frame.names
    feature_names.remove(target)

    fitted = H2OXGBoostEstimator(**estimator_args)
    fitted.train(x=feature_names, y=target, training_frame=train_frame)

    params_table = fitted._model_json["output"]["native_parameters"]
    print(params_table.as_data_frame())
    assert params_table._table_header == u"Native XGBoost Parameters"
def comparison_test():
    """Compare H2O XGBoost binomial predictions against native XGBoost on an
    all-numeric frame with a sparse DMatrix, retraining with more rounds until
    the native model has at least `ntrees` trees.

    Skipped on multinode clusters since results would not be reproducible.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        h2oParamsS = {
            "ntrees": ntrees,
            "max_depth": 4,
            "seed": runSeed,
            "learn_rate": 0.7,
            "col_sample_rate_per_tree": 0.9,
            "min_rows": 5,
            "score_tree_interval": ntrees + 1,
            "dmatrix_type": "sparse",
            "tree_method": "exact",
            "backend": "cpu"
        }
        # native parameters mirroring the H2O settings (lambda/alpha zeroed so
        # regularization matches)
        nativeParam = {
            'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
            'tree_method': 'exact',
            'seed': h2oParamsS["seed"],
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'lambda': 0.0,
            'eta': h2oParamsS["learn_rate"],
            'grow_policy': 'depthwise',
            'alpha': 0.0,
            'subsample': 1.0,
            'colsample_bylevel': 1.0,
            'max_delta_step': 0.0,
            'min_child_weight': h2oParamsS["min_rows"],
            'gamma': 0.0,
            'max_depth': h2oParamsS["max_depth"]
        }
        nrows = 10000
        ncols = 11
        factorL = 0
        numCols = 11        # all columns numeric
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols,
                                               enumCols=enumCols,
                                               enumFactors=factorL,
                                               miscfrac=0.5,
                                               randseed=dataSeed)
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        # NOTE(review): y is never removed from myX here (unlike the sibling
        # tests) — confirm H2O is expected to drop the response from x itself.
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1
        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=[])
        nrounds = ntrees
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelsfound = False
        while not (modelsfound):  # loop to make sure accurate number of trees are built
            modelInfo = nativeModel.get_dump()
            print(modelInfo)
            print("num_boost_round: {1}, Number of trees built: {0}".format(
                len(modelInfo), nrounds))
            if len(modelInfo) >= ntrees:
                modelsfound = True
            else:
                # not enough trees yet: retrain from scratch with one more round
                nrounds = nrounds + 1
                nativeModel = xgb.train(params=nativeParam,
                                        dtrain=nativeTrain,
                                        num_boost_round=nrounds)
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        print(
            "Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse....."
        )
        pyunit_utils.summarizeResult_binomial(h2oPredictS, nativePred,
                                              h2oTrainTimeS, nativeTrainTime,
                                              h2oPredictTimeS, nativeScoreTime,
                                              tolerance=1e-6)
    else:
        print(
            "******** Test skipped. This test cannot be performed in multinode environment."
        )
def comparison_test():
    """Compare H2O XGBoost multinomial predictions against native XGBoost on
    an all-categorical frame with a sparse DMatrix.

    Fixes:
    - xgb.train was called with a hard-coded ``num_boost_round=10`` that only
      worked because ``ntrees`` happened to be 10; it now uses ``ntrees`` so
      the round count always matches the ``ntree_limit=ntrees`` used below.
    - the native training timer is reset before xgb.train so the reported
      native train time no longer includes the H2O predict interval.
    """
    assert H2OXGBoostEstimator.available() is True
    runSeed = 1
    ntrees = 10
    responseL = 11          # multinomial: 11 response classes
    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {
        "ntrees": ntrees,
        "max_depth": 4,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "sparse",
        "tree_method": "exact",
        "backend": "cpu"
    }
    # native parameters mirroring the H2O settings above (lambda/alpha zeroed
    # so regularization matches)
    nativeParam = {
        'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsS["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'lambda': 0.0,
        'eta': h2oParamsS["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsS["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsS["max_depth"],
        'num_class': responseL
    }
    nrows = 10000
    ncols = 10
    factorL = 11            # factor levels per enum column
    numCols = 0
    enumCols = ncols - numCols
    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL,
        miscfrac=0.5, responseLevel=responseL)  # load in dataset and add response column
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                              enumCols=myX)
    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    # gather, print and save performance numbers for h2o model
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictS = h2oModelS.predict(trainFile)
    h2oPredictTimeS = time.time() - time1
    # train the native XGBoost; reset the timer so only training is measured
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam,
                            dtrain=nativeTrain,
                            num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1
    print(
        "Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse....."
    )
    pyunit_utils.summarizeResult_multinomial(h2oPredictS, nativePred,
                                             h2oTrainTimeS, nativeTrainTime,
                                             h2oPredictTimeS, nativeScoreTime,
                                             tolerance=1e-6)
def comparison_test():
    """Compare H2O XGBoost multinomial predictions against native XGBoost on
    an all-categorical frame with a sparse DMatrix.

    Skipped on multinode clusters since results would not be reproducible.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        responseL = 11      # multinomial: 11 response classes
        # CPU Backend is forced for the results to be comparable
        h2oParamsS = {"ntrees": ntrees, "max_depth": 4, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "sparse", "tree_method": "exact",
                      "backend": "cpu"}
        # native parameters mirroring the H2O settings above
        # NOTE(review): unlike sibling tests, 'lambda' is not zeroed here, so
        # native XGBoost uses its default L2 regularization — confirm intended.
        nativeParam = {'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsS["seed"],
                       'booster': 'gbtree', 'objective': 'multi:softprob',
                       'eta': h2oParamsS["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0,
                       'subsample': 1.0, 'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsS["min_rows"],
                       'gamma': 0.0, 'max_depth': h2oParamsS["max_depth"],
                       'num_class': responseL}
        nrows = 10000
        ncols = 10
        factorL = 11        # factor levels per enum column
        numCols = 0
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL,
                                               randseed=dataSeed)  # load in dataset and add response column
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=myX)
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(
            len(modelInfo), nrounds))
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
        pyunit_utils.summarizeResult_multinomial(h2oPredictS, nativePred,
                                                 h2oTrainTimeS, nativeTrainTime,
                                                 h2oPredictTimeS, nativeScoreTime,
                                                 tolerance=1e-6)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test():
    """Compare H2O XGBoost binomial predictions AND training metrics
    (AUC / PR-AUC) against native XGBoost on a categorical frame with a
    sparse DMatrix.

    Skipped on Python 2 (native xgboost requires py3) and on multinode
    clusters (results would not be reproducible).
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        # CPU Backend is forced for the results to be comparable
        h2oParamsS = {
            "ntrees": ntrees,
            "max_depth": 4,
            "seed": runSeed,
            "learn_rate": 0.7,
            "col_sample_rate_per_tree": 0.9,
            "min_rows": 5,
            "score_tree_interval": ntrees + 1,
            "dmatrix_type": "sparse",
            "tree_method": "exact",
            "backend": "cpu"
        }
        # native parameters mirroring the H2O settings; eval_metric requests
        # both metrics that are later compared against the H2O model
        nativeParam = {
            'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
            'tree_method': 'exact',
            'seed': h2oParamsS["seed"],
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eta': h2oParamsS["learn_rate"],
            'grow_policy': 'depthwise',
            'alpha': 0.0,
            'subsample': 1.0,
            'colsample_bylevel': 1.0,
            'max_delta_step': 0.0,
            'min_child_weight': h2oParamsS["min_rows"],
            'gamma': 0.0,
            'max_depth': h2oParamsS["max_depth"],
            'eval_metric': ['auc', 'aucpr']
        }
        nrows = 10000
        ncols = 10
        factorL = 11        # factor levels per enum column
        numCols = 0
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(
            nrows, 0, enumCols=enumCols, enumFactors=factorL,
            miscfrac=0.1, randseed=dataSeed)  # load in dataset and add response column
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=myX)
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1
        # train the native XGBoost, capturing per-round eval metrics
        nrounds = ntrees
        evals_result = {}
        watch_list = [(nativeTrain, 'train')]
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds,
                                evals=watch_list,
                                verbose_eval=True,
                                evals_result=evals_result)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(
            len(modelInfo), nrounds))
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        print(
            "Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse....."
        )
        pyunit_utils.summarizeResult_binomial(h2oPredictS, nativePred,
                                              h2oTrainTimeS, nativeTrainTime,
                                              h2oPredictTimeS, nativeScoreTime,
                                              tolerance=1e-6)
        print(
            "Comparing H2OXGBoost metrics with native XGBoost metrics when DMatrix is set to sparse....."
        )
        h2o_metrics = [
            h2oModelS.training_model_metrics()["AUC"],
            h2oModelS.training_model_metrics()["pr_auc"]
        ]
        # compare against the metrics recorded at the final boosting round
        xgboost_metrics = [
            evals_result['train']['auc'][ntrees - 1],
            evals_result['train']['aucpr'][ntrees - 1]
        ]
        # TODO: less tolerance ?
        pyunit_utils.summarize_metrics_binomial(h2o_metrics, xgboost_metrics,
                                                ["auc", "aucpr"],
                                                tolerance=1e-3)
    else:
        print(
            "******** Test skipped. This test cannot be performed in multinode environment."
        )
def comparison_test_dense():
    """Compare H2O XGBoost binomial predictions against native XGBoost on a
    mixed numeric/categorical frame using a dense DMatrix.

    Skipped on multinode clusters since results would not be reproducible.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6      # tolerance for prediction comparison
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {
            "ntrees": ntrees,
            "max_depth": maxdepth,
            "seed": runSeed,
            "learn_rate": 0.7,
            "col_sample_rate_per_tree": 0.9,
            "min_rows": 5,
            "score_tree_interval": ntrees + 1,
            "dmatrix_type": "dense",
            "tree_method": "exact",
            "backend": "cpu"
        }
        # native parameters mirroring the H2O settings above
        nativeParam = {
            'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
            'tree_method': 'exact',
            'seed': h2oParamsD["seed"],
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eta': h2oParamsD["learn_rate"],
            'grow_policy': 'depthwise',
            'alpha': 0.0,
            'subsample': 1.0,
            'colsample_bylevel': 1.0,
            'max_delta_step': 0.0,
            'min_child_weight': h2oParamsD["min_rows"],
            'gamma': 0.0,
            'max_depth': h2oParamsD["max_depth"]
        }
        nrows = 10000
        ncols = 20
        factorL = 20        # factor levels per enum column
        numCols = 10
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(
            nrows, numCols, enumCols=enumCols, enumFactors=factorL,
            miscfrac=0.01, randseed=dataSeed)  # load in dataset and add response column
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # rebind to the list of enum column names
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        t1Array = h2oModelD._model_json["output"]["scoring_history"]
        # report the classification error of the last scoring-history entry
        print("classifier error: {0}".format(
            t1Array._cell_values[len(t1Array._cell_values) - 1][
                t1Array._col_header.index("training_classification_error")]))
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                            enumCols=enumCols)
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(
            len(modelInfo), nrounds))
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred,
                                              h2oTrainTimeD, nativeTrainTime,
                                              h2oPredictTimeD, nativeScoreTime,
                                              tolerance=testTol)
    else:
        print(
            "******** Test skipped. This test cannot be performed in multinode environment."
        )
def comparison_test_dense():
    """Compare H2O XGBoost regression predictions against native XGBoost on a
    mixed frame (dense DMatrix), with a synthetic real-valued response.

    Skipped on multinode clusters since results would not be reproducible.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10     # very tight tolerance for regression comparison
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "dense", "tree_method": "exact",
                      "backend": "cpu"}
        # native parameters mirroring the H2O settings above
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsD["seed"],
                       'booster': 'gbtree', 'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0,
                       'subsample': 1.0, 'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0, 'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 20        # factor levels per enum column
        numCols = 5
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.01,
                                               randseed=dataSeed)  # load in dataset and add response column
        y = 'response'
        trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(
            nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]  # rebind to the list of enum column names
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                            enumCols=enumCols)
        nrounds = ntrees
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds)  # need to specify one more to get the right number
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                           enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred,
                                                h2oTrainTimeD, nativeTrainTime,
                                                h2oPredictTimeD, nativeScoreTime,
                                                tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O XGBoost binomial predictions against native XGBoost on a
    mixed numeric/categorical frame, retraining with more rounds until the
    native model holds at least `ntrees` trees.

    NOTE(review): despite the function name, dmatrix_type is "sparse" and the
    conversion uses convertH2OFrameToDMatrixSparse — confirm intended.
    Skipped on Python 2 and on multinode clusters.
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6      # tolerance for prediction comparison
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "sparse", "tree_method": "exact",
                      "backend": "cpu"}
        # native parameters mirroring the H2O settings above
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsD["seed"],
                       'booster': 'gbtree', 'objective': 'binary:logistic',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0,
                       'subsample': 1.0, 'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0, 'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 11        # factor levels per enum column
        numCols = 5
        enumCols = ncols - numCols
        responseL = 2       # binomial response
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL,
                                               randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # rebind to the list of enum column names
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=enumCols)
        nrounds = ntrees
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelsfound = False
        while not(modelsfound):  # loop to make sure accurate number of trees are built
            modelInfo = nativeModel.get_dump()
            print(modelInfo)
            print("num_boost_round: {1}, Number of trees built: {0}".format(
                len(modelInfo), nrounds))
            if len(modelInfo) >= ntrees:
                modelsfound = True
            else:
                # not enough trees yet: retrain from scratch with one more round
                nrounds = nrounds + 1
                nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                        num_boost_round=nrounds)
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred,
                                              h2oTrainTimeD, nativeTrainTime,
                                              h2oPredictTimeD, nativeScoreTime,
                                              tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O XGBoost binomial predictions against native XGBoost on a
    mixed numeric/categorical frame using a dense DMatrix (lambda pinned to 0
    so regularization matches H2O).

    Skipped on multinode clusters since results would not be reproducible.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6      # tolerance for prediction comparison
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "dense", "tree_method": "exact",
                      "backend": "cpu"}
        # native parameters mirroring the H2O settings above
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'binary:logistic',
                       'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 20
        factorL = 20        # factor levels per enum column
        numCols = 10
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL,
                                               miscfrac=0.01,
                                               randseed=dataSeed)  # load in dataset and add response column
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # rebind to the list of enum column names
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        t1Array = h2oModelD._model_json["output"]["scoring_history"]
        # report the classification error of the last scoring-history entry
        print("classifier error: {0}".format(
            t1Array._cell_values[len(t1Array._cell_values) - 1][
                t1Array._col_header.index("training_classification_error")]))
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                            enumCols=enumCols)
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(
            len(modelInfo), nrounds))
        # NOTE(review): time1 was last set before the H2O predict, so this
        # native training time also includes that interval — confirm intended.
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred,
                                              h2oTrainTimeD, nativeTrainTime,
                                              h2oPredictTimeD, nativeScoreTime,
                                              tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare an H2O XGBoost regression model against native XGBoost.

    Builds a mixed numeric/enum frame, replaces the enum response with a
    real-valued response, trains the same regression (reg:linear) model in
    H2O XGBoost and in native XGBoost using a dense DMatrix and the CPU
    backend, then checks predictions agree within ``testTol`` via
    ``pyunit_utils.summarizeResult_regression``.
    """
    assert H2OXGBoostEstimator.available() is True
    runSeed = 1
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    # CPU backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu"
    }
    # Native parameters mirror the H2O parameters above so the two models
    # are trained with equivalent settings.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }
    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols
    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.01)
    # load in dataset and add response column
    y = 'response'
    trainFile = trainFile.drop(
        y)  # drop the enum response and generate real values here
    yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(
        nrows, 1, integerR=1000000, misFrac=0)
    yresp.set_name(0, y)
    trainFile = trainFile.cbind(yresp)
    myX = trainFile.names
    myX.remove(y)
    enumCols = myX[0:enumCols]  # enum predictors come first in the frame

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                        enumCols=enumCols)
    # BUG FIX: reset the timer here — previously nativeTrainTime was
    # measured from the timer started before the H2O predict, so it wrongly
    # included H2O scoring time and the DMatrix conversion.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    # Compare H2O and native predictions/timings within tolerance.
    pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred,
                                            h2oTrainTimeD, nativeTrainTime,
                                            h2oPredictTimeD, nativeScoreTime,
                                            tolerance=testTol)
def comparison_test_dense():
    """Compare an H2O XGBoost multinomial model against native XGBoost.

    Trains the same multinomial (multi:softprob, ``responseL`` classes)
    model in H2O XGBoost and in native XGBoost on an all-enum frame with
    50% missing values, using a sparse DMatrix and the CPU backend, then
    checks predictions agree within ``testTol`` via
    ``pyunit_utils.summarizeResult_multinomial``.

    Skipped on multinode clusters: H2O/native results are only expected to
    match on a single node.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        responseL = 11  # number of response classes
        # CPU backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "sparse", "backend": "cpu"}
        # Native parameters mirror the H2O parameters above so the two
        # models are trained with equivalent settings.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'auto',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'multi:softprob',
                       'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"],
                       'num_class': responseL}
        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols - numCols
        # All-enum frame, half the cells missing, multinomial response.
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL,
                                               randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        # FIX: slice by enumCols (==10) instead of the hard-coded 0:11 —
        # same result here, but consistent with the sibling tests.
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=enumCols)
        nrounds = ntrees
        # BUG FIX: reset the timer here — previously nativeTrainTime was
        # measured from the timer started before the H2O predict, so it
        # wrongly included H2O scoring time, the sparse DMatrix conversion
        # and the model-dump printing below.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num of boosters found in model: {0} and num_boost_round specified: {1}.".format(len(modelInfo), nrounds))
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        # Compare H2O and native predictions/timings within tolerance.
        pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred,
                                                 h2oTrainTimeD, nativeTrainTime,
                                                 h2oPredictTimeD, nativeScoreTime,
                                                 tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare an H2O XGBoost sparse regression model against native XGBoost.

    Generates an all-enum frame with 50% missing values, replaces the enum
    response with a real-valued one, trains the same regression
    (reg:linear) model in H2O XGBoost and in native XGBoost using a sparse
    DMatrix and the CPU backend, then checks predictions agree within a
    tight tolerance via ``pyunit_utils.summarizeResult_regression``.

    Only runs on python3 (native xgboost import) and on a single-node
    cluster, where H2O/native results are expected to match.
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "sparse", "tree_method": "exact",
                      "backend": "cpu"}
        # Native parameters mirror the H2O parameters above so the two
        # models are trained with equivalent settings.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols - numCols
        responseL = 2
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL,
                                               randseed=dataSeed)
        y = 'response'
        trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(10000, 1,
                                                                integerR=1000000,
                                                                misFrac=0,
                                                                randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]  # enum predictors come first in the frame

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=enumCols)
        nrounds = ntrees
        # BUG FIX: reset the timer here — previously nativeTrainTime was
        # measured from the timer started before the H2O predict, so it
        # wrongly included H2O scoring time and the sparse DMatrix
        # conversion.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                 enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        # Compare H2O and native predictions/timings within tolerance.
        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred,
                                                h2oTrainTimeD, nativeTrainTime,
                                                h2oPredictTimeD, nativeScoreTime,
                                                tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O vs native XGBoost on a randomized multinomial problem.

    Dimensions, factor levels, class count and the run seed are all drawn
    at random, then the same multinomial (multi:softprob) model is trained
    in H2O XGBoost and in native XGBoost on a dense DMatrix, and the
    predictions are checked to agree within ``testTol`` via
    ``pyunit_utils.summarizeResult_multinomial``.
    """
    assert H2OXGBoostEstimator.available() is True
    # Randomized problem setup: sizes, cardinalities and seed vary per run.
    runSeed = random.randint(1, 1073741824)
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    nrows = random.randint(100000, 500000)
    ncols = random.randint(1, 10)
    factorL = random.randint(2, 10)
    numCols = random.randint(1, ncols)
    enumCols = ncols - numCols
    responseL = random.randint(3, 10)  # number of response classes
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense"
    }
    # Native parameters mirror the H2O parameters above so the two models
    # are trained with equivalent settings.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'auto',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"],
        'num_class': responseL
    }
    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                           enumFactors=factorL,
                                           responseLevel=responseL,
                                           miscfrac=0.01)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    enumCols = myX[0:enumCols]  # enum predictors come first in the frame

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                        enumCols=enumCols)
    # BUG FIX: reset the timer here — previously nativeTrainTime was
    # measured from the timer started before the H2O predict, so it wrongly
    # included H2O scoring time and the DMatrix conversion.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    # Compare H2O and native predictions/timings within tolerance.
    pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred,
                                             h2oTrainTimeD, nativeTrainTime,
                                             h2oPredictTimeD, nativeScoreTime,
                                             tolerance=testTol)