# Metrics for the deep-learning base model (DeepLearn is trained above this view).
DLperf = DeepLearn.model_performance()

# Base model 2: random forest, 5-fold cross-validated, keeping the
# cross-validation predictions.
RandomForest = H2ORandomForestEstimator(model_id='RandomForest',
                                        ntrees=10,
                                        max_depth=5,
                                        min_rows=10,
                                        seed=1111,
                                        nfolds=5,
                                        binomial_double_trees=True,
                                        keep_cross_validation_predictions=True)
RandomForest.train(x=x, y=y, training_frame=train)
# Eval performance:
RFperf = RandomForest.model_performance()

# Base model 3: gradient boosting with the same fold count and seed.
GradientBoost = H2OGradientBoostingEstimator(model_id = 'GradientBoost',
                                             nfolds=5,
                                             seed=1111,
                                             keep_cross_validation_predictions=True)
GradientBoost.train(x=x, y=y, training_frame=train)
GBperf = GradientBoost.model_performance()

# The ensemble refers to its base models by id string. 'RandomForest' and
# 'GradientBoost' match the model_id values set above.
# NOTE(review): 'DeepLearn' is presumably the model_id given to DeepLearn - confirm.
Ensemble = H2OStackedEnsembleEstimator(model_id="Ensemble",
                                       base_models=['DeepLearn', 'RandomForest', 'GradientBoost'])
Ensemble.train(x=x, y=y, training_frame=train)
Performance = Ensemble.model_performance()

# Score the validation frame and flatten the result into a single column.
# NOTE(review): reshape(-1, 1) stacks every column of `predic` into one; if the
# prediction frame has more than one column the values are interleaved - confirm
# that this is intended.
predic = Ensemble.predict(valid).as_data_frame()
yhat = np.array(predic).reshape(-1,1)
# Treat the calendar fields and the label as categorical in both yearly frames.
for frame in (data_2007, data_2008):
    for col in ('Month', 'DayofMonth', 'DayOfWeek', 'DepDelayed'):
        frame[col] = frame[col].asfactor()

# This gives the number of missing values
data_2007.describe()

# Create training set and test set with the labels
x_cols = [
    'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'Distance',
    'UniqueCarrier', 'Origin', 'Dest'
]
y_cols = ['DepDelayed']

# ### Gradient Boosting
gb_model = H2OGradientBoostingEstimator()
# `y` must name a single response column, so pass the column name itself
# rather than the one-element list.
gb_model.train(x=x_cols,
               y=y_cols[0],
               training_frame=data_2007,
               validation_frame=data_2008)

# Variable importances of the trained model. varimp is a method: it has to be
# called, and printed so the result is visible outside a notebook.
print(gb_model.varimp())

# Export the model as a POJO into the given directory.
gb_model.download_pojo('/mapr/my.cluster.com/user/*****/airlines')
def mojo_predict_csv_test(target_dir):
    """Check that MOJO CSV scoring matches in-cluster GBM scoring.

    Three GBMs are trained (gaussian and bernoulli on prostate, default
    distribution on iris). For each one a single row is scored in the
    cluster, the model is exported with ``download_mojo`` to
    ``target_dir/prostate_gbm_model.zip`` and the same row is scored again
    through ``h2o.mojo_predict_csv``. Predictions must agree; for the
    gaussian and bernoulli models the contributions written to the output
    CSV must also equal ``predict_contributions``.

    :param target_dir: directory that receives the input CSV, the output CSV
        and the MOJO zip.
    """
    mojo_file_name = "prostate_gbm_model.zip"
    mojo_zip_path = os.path.join(target_dir, mojo_file_name)

    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Single scoring row: row index 1 of the test frame, columns 2 onwards
    pdf = test[1, 2:]
    input_csv = "%s/in.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    # =================================================================
    # Regression
    # =================================================================
    regression_gbm1 = H2OGradientBoostingEstimator(distribution="gaussian")
    regression_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8],
                          y=1,
                          training_frame=train)
    pred_reg = regression_gbm1.predict(pdf)
    contribs_reg = regression_gbm1.predict_contributions(pdf)
    p1 = pred_reg[0, 0]
    print("Regression prediction: " + str(p1))

    download_mojo(regression_gbm1, mojo_zip_path)

    print("\nPerforming Regression Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                             mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)
    print("Prediction result: " + str(prediction_result))
    assert p1 == float(
        prediction_result[0]['predict']
    ), "expected predictions to be the same for binary and MOJO model for regression"

    print("\nComparing Regression Contributions using MOJO @... " + target_dir)
    contributions_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                                mojo_zip_path=mojo_zip_path,
                                                output_csv_path=output_csv,
                                                predict_contributions=True)
    assert contributions_result is not None
    # The contributions are compared through the CSV file that was written.
    contributions_pandas = pandas.read_csv(output_csv)
    assert_frame_equal(contribs_reg.as_data_frame(use_pandas=True),
                       contributions_pandas,
                       check_dtype=False)

    # =================================================================
    # Binomial
    # =================================================================
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = H2OGradientBoostingEstimator(distribution="bernoulli")
    bernoulli_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8],
                         y=1,
                         training_frame=train)
    pred_bin = bernoulli_gbm1.predict(pdf)
    contribs_bin = bernoulli_gbm1.predict_contributions(pdf)

    binary_prediction_0 = pred_bin[0, 1]
    binary_prediction_1 = pred_bin[0, 2]
    print("Binomial prediction: p0: " + str(binary_prediction_0))
    print("Binomial prediction: p1: " + str(binary_prediction_1))

    download_mojo(bernoulli_gbm1, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                             mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)
    mojo_prediction_0 = float(prediction_result[0]['p0'])
    mojo_prediction_1 = float(prediction_result[0]['p1'])
    print("Binomial prediction: p0: " + str(mojo_prediction_0))
    print("Binomial prediction: p1: " + str(mojo_prediction_1))

    assert abs(
        binary_prediction_0 - mojo_prediction_0
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Binomial - p0"
    assert abs(
        binary_prediction_1 - mojo_prediction_1
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Binomial - p1"

    print("\nComparing Binary Classification Contributions using MOJO @... " + target_dir)
    contributions_bin_result = h2o.mojo_predict_csv(
        input_csv_path=input_csv,
        mojo_zip_path=mojo_zip_path,
        output_csv_path=output_csv,
        predict_contributions=True)
    assert contributions_bin_result is not None
    contributions_bin_pandas = pandas.read_csv(output_csv)
    print(contributions_bin_pandas)
    print(contribs_bin.as_data_frame(use_pandas=True))
    assert_frame_equal(contribs_bin.as_data_frame(use_pandas=True),
                       contributions_bin_pandas,
                       check_dtype=False)

    # =================================================================
    # Multinomial
    # =================================================================
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    r = iris[0].runif()
    train = iris[r < 0.90]
    # Complement of the training mask. The previous `r >= 0.10` overlapped
    # with the training rows, so the scored row was usually a training row.
    test = iris[r >= 0.90]

    # Single scoring row: row index 1 of the test frame, the four predictors
    pdf = test[1, 0:4]
    input_csv = "%s/in-multi.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    multi_gbm = H2OGradientBoostingEstimator()
    multi_gbm.train(x=['C1', 'C2', 'C3', 'C4'], y='C5', training_frame=train)

    pred_multi = multi_gbm.predict(pdf)
    multinomial_prediction_1 = pred_multi[0, 1]
    multinomial_prediction_2 = pred_multi[0, 2]
    multinomial_prediction_3 = pred_multi[0, 3]
    print("Multinomial prediction (Binary): p0: " +
          str(multinomial_prediction_1))
    print("Multinomial prediction (Binary): p1: " +
          str(multinomial_prediction_2))
    print("Multinomial prediction (Binary): p2: " +
          str(multinomial_prediction_3))

    download_mojo(multi_gbm, mojo_zip_path)

    print("\nPerforming Multinomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                             mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)

    mojo_prediction_1 = float(prediction_result[0]['Iris-setosa'])
    mojo_prediction_2 = float(prediction_result[0]['Iris-versicolor'])
    mojo_prediction_3 = float(prediction_result[0]['Iris-virginica'])
    print("Multinomial prediction (MOJO): p0: " + str(mojo_prediction_1))
    print("Multinomial prediction (MOJO): p1: " + str(mojo_prediction_2))
    print("Multinomial prediction (MOJO): p2: " + str(mojo_prediction_3))

    assert abs(
        multinomial_prediction_1 - mojo_prediction_1
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Multinomial - p0"
    assert abs(
        multinomial_prediction_2 - mojo_prediction_2
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Multinomial - p1"
    assert abs(
        multinomial_prediction_3 - mojo_prediction_3
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Multinomial - p2"
# Hold out 25 % of the rows for validation (fixed seed for a repeatable split).
train_audio, valid_audio = train_audio.split_frame(ratios=[0.75], seed=1)

# Every column except the last is a predictor; the last column is the response.
no_x_audio = len(train_audio.columns)
x_audio = train_audio.columns[:-1]
y_audio = train_audio.columns[-1]

# ---------------------------------------------------------------- modelling
# Candidate estimators. Only the random forest is trained below.
rf_audio = H2ORandomForestEstimator(
    seed=12,
    ntrees=50,
    max_depth=20,
    balance_classes=False,
    nfolds=5,
    stopping_metric='MSE',
)
gbm_audio = H2OGradientBoostingEstimator(
    ntrees=50,
    max_depth=20,
    distribution='AUTO',
    nfolds=5,
    stopping_metric='MSE',
)

# Fit the selected estimator.
model_audio = rf_audio
model_audio.train(
    x=x_audio,
    y=y_audio,
    training_frame=train_audio,
    validation_frame=valid_audio,
)

# ------------------------------------------------------ performance checking
# Score the dev frame (dev_audio is defined outside this view).
dev_pred = model_audio.predict(dev_audio)
def multinomial_auc_prostate_gbm():
    """Exercise multinomial AUC / PR-AUC reporting of a GBM on prostate data.

    The function:

    1. trains a multinomial GBM on ``GLEASON`` with ``auc_type="WEIGHTED_OVR"``
       and checks that ``auc()`` / ``aucpr()`` equal the values read from the
       multinomial AUC tables;
    2. compares four table values against ``sklearn``'s ``roc_auc_score``
       (ovr/ovo, macro/weighted) within ``1e-7``;
    3. retrains with ``auc_type="MACRO_OVR"`` and checks the reported values
       follow the new setting;
    4. checks that AUC-based early stopping builds fewer than ``ntrees``
       scoring rows;
    5. checks ``model_performance`` with an ``auc_type`` override, with and
       without a frame;
    6. checks the cross-validation summary exposes ``pr_auc`` and not
       ``aucpr``.
    """
    data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response_col = "GLEASON"
    data[response_col] = data[response_col].asfactor()

    predictors = ["RACE", "AGE", "PSA", "DPROS", "CAPSULE", "VOL", "DCAPS"]
    distribution = "multinomial"

    # train model
    gbm = H2OGradientBoostingEstimator(ntrees=1, max_depth=2, nfolds=3, distribution=distribution,
                                       auc_type="WEIGHTED_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data)
    gbm.show()

    # get result on training data from h2o
    cm = gbm.confusion_matrix(data)
    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)

    print(cm)
    print(h2o_auc_table.as_data_frame())
    print(h2o_aucpr_table.as_data_frame())

    # Column 3 of the tables holds the values; the row indices below are the
    # positions this test reads the aggregate entries from.
    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_weighted_auc = h2o_auc_table[3][8]
    h2o_ovo_macro_auc = h2o_auc_table[3][30]
    h2o_ovo_weighted_auc = h2o_auc_table[3][31]

    h2o_ovr_weighted_aucpr = h2o_aucpr_table[3][8]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    print("default vs. table AUC "+str(h2o_ovr_weighted_auc)+" "+str(h2o_default_auc))
    print("default vs. table PR AUC "+str(h2o_ovr_weighted_aucpr)+" "+str(h2o_default_aucpr))

    # auc()/aucpr() should report the configured type (ovr weighted)
    assert h2o_ovr_weighted_auc == h2o_default_auc, "default vs. table AUC "+str(h2o_ovr_weighted_auc)+" != "+str(h2o_default_auc)
    assert h2o_ovr_weighted_aucpr == h2o_default_aucpr, "default vs. table PR AUC "+str(h2o_ovr_weighted_aucpr)+" != "+str(h2o_default_aucpr)

    # transform data for sklearn: drop the label column of the prediction
    # frame, keep the actual labels as a plain list
    prediction = gbm.predict(data).as_data_frame().iloc[:,1:]
    actual = data[response_col].as_data_frame().iloc[:, 0].tolist()

    # get result on training data from sklearn
    sklearn_ovr_macro_auc = roc_auc_score(actual, prediction, multi_class="ovr", average='macro')
    sklearn_ovr_weighted_auc = roc_auc_score(actual, prediction, multi_class="ovr", average='weighted')
    sklearn_ovo_macro_auc = roc_auc_score(actual, prediction, multi_class="ovo", average='macro')
    sklearn_ovo_weighted_auc = roc_auc_score(actual, prediction, multi_class="ovo", average='weighted')

    print("sklearn vs. h2o ovr macro:    "+str(sklearn_ovr_macro_auc)+" "+str(h2o_ovr_macro_auc))
    print("sklearn vs. h2o ovr weighted: "+str(sklearn_ovr_weighted_auc)+" "+str(h2o_ovr_weighted_auc))
    print("sklearn vs. h2o ovo macro:    "+str(sklearn_ovo_macro_auc)+" "+str(h2o_ovo_macro_auc))
    print("sklearn vs. h2o ovo weighted: "+str(sklearn_ovo_weighted_auc)+" "+str(h2o_ovo_weighted_auc))

    # compare results h2o vs sklearn
    precision = 1e-7
    assert abs(h2o_ovr_macro_auc - sklearn_ovr_macro_auc) < precision, "sklearn vs. h2o ovr macro: "+str(sklearn_ovr_macro_auc)+" != "+str(h2o_ovr_macro_auc)
    assert abs(h2o_ovr_weighted_auc - sklearn_ovr_weighted_auc) < precision, "sklearn vs. h2o ovr weighted: "+str(sklearn_ovr_weighted_auc)+" != "+str(h2o_ovr_weighted_auc)
    assert abs(h2o_ovo_macro_auc - sklearn_ovo_macro_auc) < precision, "sklearn vs. h2o ovo macro: "+str(sklearn_ovo_macro_auc)+" != "+str(h2o_ovo_macro_auc)
    assert abs(h2o_ovo_weighted_auc - sklearn_ovo_weighted_auc) < precision, "sklearn vs. h2o ovo weighted: "+str(sklearn_ovo_weighted_auc)+" != "+str(h2o_ovo_weighted_auc)

    # set auc_type
    gbm = H2OGradientBoostingEstimator(ntrees=1, max_depth=2, nfolds=3, distribution=distribution,
                                       auc_type="MACRO_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)
    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)

    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_macro_aucpr = h2o_aucpr_table[3][7]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    # The message reports the two values that are actually compared (it used
    # to print the sklearn value from the earlier section).
    assert abs(h2o_ovr_macro_auc - h2o_default_auc) < precision, "default auc vs. h2o ovr macro auc: "+str(h2o_ovr_macro_auc)+" != "+str(h2o_default_auc)
    assert abs(h2o_ovr_macro_aucpr - h2o_default_aucpr) < precision, "default aucpr vs. h2o ovr macro aucpr: "+str(h2o_ovr_macro_aucpr)+" != "+str(h2o_default_aucpr)

    # test early stopping
    ntrees = 100
    gbm2 = H2OGradientBoostingEstimator(ntrees=ntrees, max_depth=2, nfolds=3, distribution=distribution,
                                        score_each_iteration=True, auc_type="MACRO_OVR",
                                        stopping_metric="AUC", stopping_rounds=3)
    gbm2.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)
    assert ntrees > gbm2.score_history().shape[0], "Test early stopping: Training should stop early."

    # test performance with different auc type
    perf2 = gbm.model_performance(data, auc_type="WEIGHTED_OVO")
    perf2_auc = perf2.auc()
    assert abs(h2o_ovo_weighted_auc - perf2_auc) < precision, "h2o ovo weighted vs. h2o performance ovo weighted: "+str(h2o_ovo_weighted_auc)+" != "+str(perf2_auc)

    # test performance with no data and auc_type is set
    ntrees = 2
    gbm3 = H2OGradientBoostingEstimator(ntrees=ntrees, max_depth=2, nfolds=3, distribution=distribution)
    gbm3.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)
    perf3 = gbm3.model_performance(train=True, auc_type="WEIGHTED_OVO")
    perf3_auc = perf3.auc()
    assert perf3_auc == "NaN", "AUC should be \"NaN\" because it is not set in model parameters and test_data is None"

    # test aucpr is not in cv summary
    print(gbm._model_json["output"]["cv_scoring_history"][0]._col_header)
    cv_summary_names = gbm.cross_validation_metrics_summary()[0]
    assert "aucpr" not in cv_summary_names, "The aucpr should not be in cross-validation metrics summary."
    assert "pr_auc" in cv_summary_names, "The pr_auc should be in cross-validation metrics summary."
# 80/20 split: the first part is the training frame, the second part is used
# as the validation frame.
splits = crimeWithWeatherHF.split_frame(ratios=[0.8])
train, test = splits

# The response is "Arrest"; every other column is a predictor.
response_column = "Arrest"
predictor_columns = train.drop("Arrest").col_names

# --- GBM -------------------------------------------------------------------
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Configure the model
gbm_model = H2OGradientBoostingEstimator(
    ntrees=50,
    max_depth=3,
    learn_rate=0.1,
    distribution="bernoulli",
)

# Fit it, scoring against the held-out part
gbm_model.train(
    x=predictor_columns,
    y=response_column,
    training_frame=train,
    validation_frame=test,
)

# --- Deep learning ---------------------------------------------------------
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Configure the model with default parameters
dl_model = H2ODeepLearningEstimator()

# Train the model
pipeline = PMMLPipeline([ ("mapper", mapper), ("uploader", H2OFrameCreator()), ("classifier", classifier) ]) pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types = ["categorical"])) pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13)) classifier = pipeline._final_estimator store_mojo(classifier, name) store_pkl(pipeline, name) adjusted = pipeline.predict(audit_X) adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"]) store_csv(adjusted.as_data_frame(), name) if "Audit" in datasets and with_h2o: build_audit_h2o(H2OGradientBoostingEstimator(distribution = "bernoulli", ntrees = 17), "H2OGradientBoostingAudit") build_audit_h2o(H2OGeneralizedLinearEstimator(family = "binomial"), "H2OLogisticRegressionAudit") build_audit_h2o(H2ORandomForestEstimator(distribution = "bernoulli", seed = 13), "H2ORandomForestAudit") audit_dict_X = audit_X.to_dict("records") def build_audit_dict(classifier, name, with_proba = True): pipeline = PMMLPipeline([ ("dict-transformer", DictVectorizer()), ("classifier", classifier) ]) pipeline.fit(audit_dict_X, audit_y) store_pkl(pipeline, name) adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"]) if with_proba == True: adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
import pandas as pd
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Connect to an H2O cluster at a fixed address.
con = h2o.connect(url='http://192.168.5.208:54321/')

csv_data = pd.read_csv('股票数据/处理后数据/processed_601857.csv', encoding='utf8')
# Binary label: True when '20_closing_price' is more than 1.2 times 'closing_price'.
csv_data['earn'] = csv_data['20_closing_price'] > csv_data['closing_price']*1.2
csv_data_ = h2o.H2OFrame(csv_data)

# 10-fold cross-validated bernoulli GBM with AUC-based early stopping.
model = H2OGradientBoostingEstimator(model_id='stock_601857',
                                     nfolds=10,
                                     distribution = "bernoulli",
                                     ntrees = 2000,
                                     max_depth = 10,
                                     learn_rate = 0.4,
                                     histogram_type = "UniformAdaptive",
                                     min_split_improvement = 0.000001,
                                     balance_classes = False,
                                     seed = 52345,
                                     stopping_rounds = 5,
                                     stopping_metric = 'AUC',
                                     stopping_tolerance = 0.001,
                                     col_sample_rate = 0.6,
                                     col_sample_rate_per_tree = 0.6,
                                     col_sample_rate_change_per_level = 0.6,
                                     sample_rate = 0.85,
                                     min_rows = 100, )

# 80/20 split into named frames.
# NOTE(review): the variable is spelled `traning_data`; check how the rest of
# the file refers to it.
traning_data, test_data = csv_data_.split_frame(ratios=[0.8], destination_frames=["train_frame", "test_data"])

# Bare expression: has no effect when run as a script.
csv_data.keys()

# The predictor list continues beyond this view.
model.train(x=['closing_price', 'upping_ratio', 'changing_ratio', 'volume',
               'upping_ratio1', 'upping_ratio2', 'upping_ratio3', 'upping_ratio4', 'upping_ratio5',
               'A_index_closing_price', 'A_index_upping_money', 'A_index_upping_ratio',
               'A_index_volume', 'A_index_volume_money',
               'B_index_closing_price', 'B_index_upping_money', 'B_index_upping_ratio',
               'B_index_volume', 'B_index_volume_money',
               'top50_index_closing_price', 'top50_index_upping_money', 'top50_index_upping_ratio',
               'top50_index_volume', 'top50_index_volume_money',
               'sh_index_closing_price', 'sh_index_upping_money',
# NOTE(review): this statement presumably closes a loop that begins above this
# view - confirm its indentation against the full file.
cum_list.append(B[k])

# Concatenate the collected pieces and upload them to H2O through a CSR matrix.
train = pd.concat(cum_list)
spm = sp.csr_matrix(train.values)
d = h2o.H2OFrame(spm)

# Turn the columns listed in cat_vars_index into categoricals
for col in cat_vars_index:
    d[col] = d[col].asfactor()

start = time.time()

# Train base models for stacked ensemble. Both use nfolds=10, seed=1 and keep
# their cross-validation predictions.
# NOTE(review): y=-1 presumably selects the last column as the response - confirm.
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      nfolds=10,
                                      ntrees=5,
                                      keep_cross_validation_predictions=True,
                                      seed=1)
my_gbm.train(y=-1, training_frame=d)

my_rf = H2ORandomForestEstimator(nfolds=10,
                                 ntrees=5,
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(y=-1, training_frame=d)

# Train a stacked ensemble using the GBM and random forest above
ensemble = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf])
ensemble.train(y=-1, training_frame=d)
def test_gbm_grid_search_over_params(self):
    """
    test_gbm_grid_search_over_params: test for condition 1 and performs the following:
    a. grab all truly griddable parameters and randomly or manually set the parameter values.
    b. Next, build H2O GBM models using grid search.  Count and make sure models
       are only built for hyper-parameters set to legal values.  No model is built for bad hyper-parameters
       values.  We should instead get a warning/error message printed out.
    c. For each model built using grid search, we will extract the parameters used in building
       that model and manually build a H2O GBM model.  MSEs are calculated from a test set
       to compare the performance of grid search model and our manually built model.  If their MSEs
       are close, declare test success.  Otherwise, declare test failure.
    d. we will check and make sure the models are built within the max_runtime_secs time limit that was set
       for it as well.  If max_runtime_secs was exceeded, declare test failure as well.

    Failures are recorded by incrementing ``self.test_failed``; nothing is raised.
    An exception is counted as a failure only when ``self.possible_number_models``
    is positive.
    """
    print(
        "*******************************************************************************************"
    )
    print("test_gbm_grid_search_over_params for GBM " + self.family)
    h2o.cluster_info()

    try:
        print("Hyper-parameters used here is {0}".format(
            self.final_hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2OGradientBoostingEstimator(
            distribution=self.family, nfolds=self.nfolds, seed=self.seed),
            hyper_params=self.final_hyper_params)
        grid_model.train(x=self.x_indices,
                         y=self.y_index,
                         training_frame=self.training1_data)

        self.correct_model_number = len(
            grid_model)  # store number of models built

        # make sure the correct number of models are built by gridsearch
        if not (self.correct_model_number == self.possible_number_models):  # wrong grid model number
            self.test_failed += 1
            print(
                "test_gbm_grid_search_over_params for GBM failed: number of models built by gridsearch "
                "does not equal to all possible combinations of hyper-parameters"
            )
        else:
            # add parameters into params_dict.  Use this to manually build model
            params_dict = {
                "distribution": self.family,
                "nfolds": self.nfolds,
                "seed": self.seed,
            }
            total_run_time_limits = 0.0  # calculate upper bound of max_runtime_secs
            true_run_time_limits = 0.0
            manual_run_runtime = 0.0

            # compare MSE performance of model built by gridsearch with manually built model
            for each_model in grid_model:
                params_list = grid_model.get_hyperparams_dict(each_model._id)
                params_list.update(params_dict)

                # parameters that must be passed to .train() rather than to the constructor
                model_params = dict()

                # need to take out max_runtime_secs from model parameters, it is now set in .train()
                if "max_runtime_secs" in params_list:
                    max_runtime = params_list.pop("max_runtime_secs")
                    model_params["max_runtime_secs"] = max_runtime
                else:
                    max_runtime = 0

                for train_only in ("r2_stopping", "validation_frame",
                                   "learn_rate_annealing"):
                    if train_only in params_list:
                        model_params[train_only] = params_list.pop(train_only)

                # make sure manual model was provided the same max_runtime_secs as the grid model
                each_model_runtime = pyunit_utils.find_grid_runtime(
                    [each_model])

                manual_model = H2OGradientBoostingEstimator(**params_list)
                manual_model.train(x=self.x_indices,
                                   y=self.y_index,
                                   training_frame=self.training1_data,
                                   **model_params)

                # collect the time taken to manually built all models
                model_runtime = pyunit_utils.find_grid_runtime(
                    [manual_model])  # time taken to build this model
                manual_run_runtime += model_runtime

                summary_list = manual_model._model_json['output'][
                    'model_summary']
                tree_num = summary_list.cell_values[0][
                    summary_list.col_header.index('number_of_trees')]

                if max_runtime > 0:
                    # shortest possible time it takes to build this model
                    if (max_runtime < self.min_runtime_per_tree) or (tree_num <= 1):
                        total_run_time_limits += model_runtime
                    else:
                        total_run_time_limits += max_runtime

                true_run_time_limits += max_runtime

                # compute and compare test metrics between the two models
                test_grid_model_metrics = each_model.model_performance(
                )._metric_json[self.training_metric]
                test_manual_model_metrics = manual_model.model_performance(
                )._metric_json[self.training_metric]

                # just compare the mse in this case within tolerance; a mismatch only warns
                if (each_model_runtime > 0) and \
                        (abs(model_runtime - each_model_runtime)/each_model_runtime < self.allowed_runtime_diff) \
                        and (abs(test_grid_model_metrics - test_manual_model_metrics) > self.allowed_diff):
                    # self.test_failed += 1             # count total number of tests that have failed
                    print(
                        "test_gbm_grid_search_over_params for GBM warning: grid search model mdetric ({0}) and "
                        "manually built H2O model metric ({1}) differ too much"
                        "!".format(test_grid_model_metrics,
                                   test_manual_model_metrics))

            total_run_time_limits = max(
                total_run_time_limits,
                true_run_time_limits) * (1 + self.extra_time_fraction)

            # make sure the max_runtime_secs is working to restrict model built time
            if not (manual_run_runtime <= total_run_time_limits):
                self.test_failed += 1
                print(
                    "test_gbm_grid_search_over_params for GBM failed: time taken to manually build models is {0}."
                    "  Maximum allowed time is {1}".format(
                        manual_run_runtime, total_run_time_limits))
            else:
                print(
                    "time taken to manually build all models is {0}. Maximum allowed time is "
                    "{1}".format(manual_run_runtime, total_run_time_limits))

            if self.test_failed == 0:
                print(
                    "test_gbm_grid_search_over_params for GBM has passed!")
    except Exception as e:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate, and report what was raised.
        if self.possible_number_models > 0:
            print(
                "test_gbm_grid_search_over_params for GBM failed: exception was thrown for no reason."
            )
            print("Exception was: {0!r}".format(e))
            self.test_failed += 1
def stackedensemble_metalearner_test():
    """Check metalearner selection of H2OStackedEnsembleEstimator.

    For every supported ``metalearner_algorithm`` an ensemble is trained on top
    of a cross-validated GBM and random forest with ``metalearner_nfolds=3``.
    The test asserts that:

    1) the ensemble reports the expected metalearner algorithm ('AUTO' is
       expected to resolve to 'glm') and ``metalearner_nfolds`` of 3;
    2) the metalearner model itself uses that algorithm and ``nfolds`` of 3.
    """
    # Load the data
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Response column and predictors (every other column)
    y = "response"
    x = train.columns
    x.remove(y)

    # The response has to be categorical
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Fold count shared by the base learners
    nfolds = 3

    # Base learner 1: cross-validated GBM
    my_gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=10,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1,
    )
    my_gbm.train(x=x, y=y, training_frame=train)

    # Base learner 2: cross-validated random forest
    my_rf = H2ORandomForestEstimator(
        ntrees=50,
        nfolds=nfolds,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        seed=1,
    )
    my_rf.train(x=x, y=y, training_frame=train)

    def train_ensemble_using_metalearner(algo, expected_algo):
        print("Training ensemble using {} metalearner.".format(algo))

        se = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                         metalearner_algorithm=algo,
                                         metalearner_nfolds=3)
        se.train(x=x, y=y, training_frame=train)

        # What the ensemble reports
        assert se.params['metalearner_algorithm']['actual'] == expected_algo
        assert se.params['metalearner_nfolds']['actual'] == 3

        # What the metalearner model actually is
        meta = h2o.get_model(se.metalearner()['name'])
        assert meta.algo == expected_algo, \
            "Expected that the metalearner would use {}, but actually used {}.".format(expected_algo, meta.algo)
        assert meta.params['nfolds']['actual'] == 3

    metalearner_algos = ['AUTO', 'deeplearning', 'drf', 'gbm', 'glm', 'naivebayes', 'xgboost']
    for algo in metalearner_algos:
        if algo == 'AUTO':
            expected_algo = 'glm'
        else:
            expected_algo = algo
        train_ensemble_using_metalearner(algo, expected_algo)
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters used by the tests:

    1. Train a bare-bone GBM to time it and to obtain its parameter list.
    2. Extract the griddable parameters, their types and defaults from that model.
    3. Generate hyper-parameter values with pyunit_utils.gen_grid_search, skipping the
       names in self.exclude_parameter_lists.
    4. Scale max_runtime_secs by the bare-bone training time, count the models the grid is
       expected to build, and write the final hyper-parameters to a json file.

    :return: None
    """
    # bare-bone model: source of the parameter list and of the timing baseline
    base_model = H2OGradientBoostingEstimator(distribution=self.family,
                                              seed=self.seed,
                                              nfolds=self.nfolds)
    base_model.train(x=self.x_indices,
                     y=self.y_index,
                     training_frame=self.training1_data)

    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])  # model train time
    print("Time taken to build a base barebone model is {0}".format(
        self.model_run_time))

    # per-tree time; fall back to the whole run time when no tree was built
    summary = base_model._model_json["output"]["model_summary"]
    tree_count = summary.cell_values[0][summary.col_header.index('number_of_trees')]
    self.min_runtime_per_tree = (self.model_run_time if tree_count == 0
                                 else self.model_run_time / tree_count)

    # griddable parameters with their types and defaults
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = gridables

    # randomly generate griddable parameter values, including values outside the legal range
    int_count = random.randint(1, self.max_int_number)
    real_count = random.randint(1, self.max_real_number)
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(base_model.full_parameters.keys(), self.hyper_params,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     int_count, self.max_int_val, self.min_int_val,
                                     real_count, self.max_real_val, self.min_real_val)

    # scale the max_runtime_secs parameters
    time_scale = self.time_scale * self.model_run_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            time_scale * secs for secs in self.hyper_params["max_runtime_secs"]
        ]

    # final_hyper_params takes only a subset of the griddable parameters, while
    # hyper_params holds all of them
    self.possible_number_models, self.final_hyper_params = \
        pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one,
                                            self.params_more_than_zero, self.params_more_than_one,
                                            self.params_zero_positive, self.max_grid_model)

    # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
    runtime_missing = "max_runtime_secs" not in self.final_hyper_params
    if runtime_missing and "max_runtime_secs" in self.hyper_params:
        runtimes = self.hyper_params["max_runtime_secs"]
        self.final_hyper_params["max_runtime_secs"] = runtimes
        # only non-negative run times count towards the expected models
        len_good_time = len([secs for secs in runtimes if secs >= 0])
        self.possible_number_models = self.possible_number_models * len_good_time

    if "fold_assignment" in self.final_hyper_params:
        self.possible_number_models = self.possible_number_models * self.scale_model

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                             self.sandbox_dir,
                                             self.json_filename,
                                             self.final_hyper_params)
# Recode the raw frame (recode_cc_data is defined outside this view) and make
# the response column categorical.
data = recode_cc_data(data)
data[y] = data[y].asfactor()
data.describe()

# 70/30 train/test split with a fixed seed
train, test = data.split_frame([0.7], seed=12345)

# summarize split
print('Train data rows = %d, columns = %d' % (train.shape[0], train.shape[1]))
print('Test data rows = %d, columns = %d' % (test.shape[0], test.shape[1]))

model = H2OGradientBoostingEstimator(ntrees=150,            # maximum 150 trees in GBM
                                     max_depth=4,           # trees can have maximum depth of 4
                                     sample_rate=0.9,       # use 90% of rows in each iteration (tree)
                                     col_sample_rate=0.9,   # use 90% of variables in each iteration (tree)
                                     stopping_rounds=5,     # stop if validation error does not decrease for 5 iterations (trees)
                                     score_tree_interval=1, # for reproducibility, set higher for bigger data
                                     seed=12345)            # random seed for reproducibility

# train a GBM model; the test split is passed as the validation frame
model.train(y=y, x=X, training_frame=train, validation_frame=test)

# print AUC measured on the validation frame
print('GBM Test AUC = %.2f' % model.auc(valid=True))

# Rows of the test split whose ID equals 29116.
row = test[test['ID'] == 29116]
# Bare expression: has no effect when run as a script.
row

# The body of this definition lies beyond this view.
def generate_local_sample(row, frame, X, N=1000):
# Tail of a call that starts above this view.
epochs=100)
model_dl.train(
    x= features,
    y="loan_status",
    training_frame=train_split,
    validation_frame=valid_split)
# Bare expression: has no effect when run as a script.
model_dl.params
print(model_dl)

#GBM
from h2o.estimators.gbm import H2OGradientBoostingEstimator
model_gbm = H2OGradientBoostingEstimator(distribution='bernoulli',
                                         ntrees=100,
                                         max_depth=4,
                                         learn_rate=0.1)
model_gbm.train(x=features,
                y="loan_status",
                training_frame=train_split,
                validation_frame=valid_split)
print(model_gbm)

#GBM with cross validation (5 folds, same settings as model_gbm otherwise)
cvmodel = H2OGradientBoostingEstimator(distribution='bernoulli',
                                       ntrees=100,
                                       max_depth=4,
                                       learn_rate=0.1,
                                       nfolds=5)
# NOTE(review): trains on `train`, not `train_split` as the models above do -
# presumably the unsplit training frame; confirm.
cvmodel.train(x=features, y="loan_status", training_frame=train)
print(cvmodel)
def _assert_no_unexpected_metric_keys(keys_have, keys_desired, kind):
    """Assert that every key in ``keys_have`` is also listed in ``keys_desired``.

    :param keys_have: keys found in the metric json of the model under test.
    :param keys_desired: keys that are allowed to appear.
    :param kind: label of the metric family, interpolated into the failure message
        (e.g. ``"regression"``, ``"glm-binomial"``).
    :raises AssertionError: if ``keys_have`` contains a key missing from ``keys_desired``.
    """
    # The check is one-sided on purpose: keys that are desired but absent are not reported.
    diff = list(set(keys_have) - set(keys_desired))
    assert not diff, "There's a difference between the current ({0}) and the desired ({1}) {3} " \
                     "metric json. The difference is {2}".format(keys_have, keys_desired, diff, kind)


def metric_json_check():
    """Check the keys of the metric json returned by ``model_performance()``.

    Trains GBM and GLM regression models, GBM and GLM binomial models, a GBM
    multinomial model and a k-means model, and asserts that none of their metric
    jsons contains a key outside the expected list for that model kind.

    :raises AssertionError: if any metric json contains an unexpected key.
    """
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = H2OGradientBoostingEstimator(distribution="gaussian")
    reg_mod.train(x=list(range(3, df.ncol)), y="CAPSULE", training_frame=df)
    reg_met = reg_mod.model_performance()
    _assert_no_unexpected_metric_keys(
        list(reg_met._metric_json.keys()),
        [u'model_category', u'description', u'r2', u'frame', u'model_checksum', u'MSE', u'RMSE', u'mae',
         u'rmsle', u'__meta', u'_exclude_fields', u'scoring_time', u'predictions', u'model',
         u'duration_in_ms', u'frame_checksum', u'nobs', u'mean_residual_deviance', u'custom_metric_name',
         u'custom_metric_value'],
        "regression")

    # Regression metric json (GLM)
    reg_mod = H2OGeneralizedLinearEstimator(family="gaussian")
    reg_mod.train(x=list(range(3, df.ncol)), y="CAPSULE", training_frame=df)
    reg_met = reg_mod.model_performance()
    _assert_no_unexpected_metric_keys(
        list(reg_met._metric_json.keys()),
        [u'model_category', u'description', u'r2', u'residual_degrees_of_freedom', u'frame',
         u'model_checksum', u'MSE', u'RMSE', u'mae', u'rmsle', u'__meta', u'_exclude_fields',
         u'null_deviance', u'scoring_time', u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
         u'duration_in_ms', u'frame_checksum', u'nobs', u'residual_deviance', u'mean_residual_deviance',
         u'custom_metric_name', u'custom_metric_value'],
        "glm-regression")

    # Binomial metric json
    bin_mod = H2OGradientBoostingEstimator(distribution="bernoulli")
    df["CAPSULE"] = df["CAPSULE"].asfactor()
    bin_mod.train(x=list(range(3, df.ncol)), y="CAPSULE", training_frame=df)
    bin_met = bin_mod.model_performance()
    _assert_no_unexpected_metric_keys(
        list(bin_met._metric_json.keys()),
        [u'AUC', u'Gini', u'model_category', u'description', u'mean_per_class_error', u'r2', u'frame',
         u'model_checksum', u'MSE', u'RMSE', u'__meta', u'_exclude_fields', u'gains_lift_table',
         u'logloss', u'scoring_time', u'thresholds_and_metric_scores', u'predictions',
         u'max_criteria_and_metric_scores', u'model', u'duration_in_ms', u'frame_checksum', u'nobs',
         u'domain', u'custom_metric_name', u'custom_metric_value', u'pr_auc'],
        "binomial")

    # Binomial metric json (GLM)
    bin_mod = H2OGeneralizedLinearEstimator(family="binomial")
    bin_mod.train(x=list(range(3, df.ncol)), y="CAPSULE", training_frame=df)
    # Refresh the metrics from the GLM; without this the check below would silently
    # re-inspect the GBM metrics computed in the previous section.
    bin_met = bin_mod.model_performance()
    _assert_no_unexpected_metric_keys(
        list(bin_met._metric_json.keys()),
        [u'frame', u'residual_deviance', u'max_criteria_and_metric_scores', u'MSE', u'RMSE',
         u'frame_checksum', u'nobs', u'AIC', u'logloss', u'Gini', u'predictions', u'AUC', u'description',
         u'mean_per_class_error', u'model_checksum', u'duration_in_ms', u'model_category',
         u'gains_lift_table', u'r2', u'residual_degrees_of_freedom', u'__meta', u'_exclude_fields',
         u'null_deviance', u'scoring_time', u'null_degrees_of_freedom', u'model',
         u'thresholds_and_metric_scores', u'domain', u'custom_metric_name', u'custom_metric_value',
         u'pr_auc'],
        "glm-binomial")

    # Multinomial metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = H2OGradientBoostingEstimator(distribution="multinomial")
    mul_mod.train(x=myX, y=myY, training_frame=df)
    mul_met = mul_mod.model_performance()
    _assert_no_unexpected_metric_keys(
        list(mul_met._metric_json.keys()),
        [u'cm', u'model_category', u'description', u'mean_per_class_error', u'r2', u'frame', u'nobs',
         u'model_checksum', u'MSE', u'RMSE', u'__meta', u'_exclude_fields', u'logloss', u'scoring_time',
         u'predictions', u'hit_ratio_table', u'model', u'duration_in_ms', u'frame_checksum',
         u'custom_metric_name', u'custom_metric_value'],
        "multinomial")

    # Clustering metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    from h2o.estimators.kmeans import H2OKMeansEstimator
    clus_mod = H2OKMeansEstimator(k=3, standardize=False)
    clus_mod.train(x=list(range(4)), training_frame=df)
    clus_met = clus_mod.model_performance()
    _assert_no_unexpected_metric_keys(
        list(clus_met._metric_json.keys()),
        [u'tot_withinss', u'model_category', u'description', u'frame', u'model_checksum', u'MSE',
         u'RMSE', u'__meta', u'_exclude_fields', u'scoring_time', u'betweenss', u'predictions', u'totss',
         u'model', u'duration_in_ms', u'frame_checksum', u'nobs', u'centroid_stats',
         u'custom_metric_name', u'custom_metric_value'],
        "clustering")
def algo_max_runtime_secs():
    '''
    Pyunit test checking that max_runtime_secs can restrict the model training time for the h2o algos.
    See PUBDEV-4702.

    Every iterative algo is handed to grabRuntimeInfo together with its training frame.  The process
    exits with status 1 when the module-level model_within_max_runtime list sums to a positive value.
    '''
    global model_within_max_runtime
    global err_bound

    def load(relative_path):
        # Locate a test data file and import it as an H2O frame.
        return h2o.import_file(path=pyunit_utils.locate(relative_path))

    seed = 12345

    # GLRM: it does not make sense to stop in the middle of an iteration
    glrm_frame = load("smalldata/gridsearch/glrmdata1000x25.csv")
    glrm_columns = list(range(glrm_frame.ncol))
    glrm = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3,
                                          transform="STANDARDIZE", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, glrm, glrm_frame, glrm_columns)
    cleanUp([glrm_frame, glrm])

    # deep learning: the last column is the response
    gaussian_frame = load("smalldata/gridsearch/gaussian_training1_set.csv")
    gaussian_y = gaussian_frame.ncol - 1
    gaussian_x = list(range(gaussian_y))
    deep_learning = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, deep_learning, gaussian_frame, gaussian_x, gaussian_y)
    cleanUp([gaussian_frame, deep_learning])

    # stacked ensemble: the stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM, GLM and random forest all share the multinomial frame
    multinomial_frame = load("smalldata/gridsearch/multinomial_training1_set.csv")
    multinomial_y = multinomial_frame.ncol - 1
    multinomial_x = list(range(multinomial_y))
    multinomial_frame[multinomial_y] = multinomial_frame[multinomial_y].round().asfactor()

    gbm = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, gbm, multinomial_frame, multinomial_x, multinomial_y)
    cleanUp([gbm])

    glm = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, glm, multinomial_frame, multinomial_x, multinomial_y)
    cleanUp([glm])

    # naive Bayes: not iterative
    print(
        "******************** Skip testing Naives Bayes.  Not an iterative algo."
    )

    # random forest
    # NOTE(review): unlike the GBM and GLM runs, no response index is passed here - confirm that
    # relying on grabRuntimeInfo's default is intended.
    random_forest = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0, seed=seed)
    grabRuntimeInfo(err_bound, 2.0, random_forest, multinomial_frame, multinomial_x)
    cleanUp([random_forest, multinomial_frame])

    # PCA
    pca_frame = load("smalldata/gridsearch/pca1000by25.csv")
    pca_columns = list(range(pca_frame.ncol))
    pca = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True, seed=seed)
    grabRuntimeInfo(err_bound * 5, 2, pca, pca_frame, pca_columns)
    cleanUp([pca_frame, pca])

    # kmeans
    kmeans_frame = load("smalldata/gridsearch/kmeans_8_centers_3_coords.csv")
    kmeans_columns = list(range(kmeans_frame.ncol))
    kmeans = H2OKMeansEstimator(k=10, seed=seed)
    grabRuntimeInfo(err_bound * 2, 2.5, kmeans, kmeans_frame, kmeans_columns)
    cleanUp([kmeans_frame, kmeans])

    # word2vec: trained on the first 170000 rows of the text column
    text_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1,
                                 col_types=["string"])
    words = text_frame[0:170000, 0]
    word2vec = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, word2vec, words, [], 0)
    cleanUp([text_frame, words, word2vec])

    failures = sum(model_within_max_runtime)
    if failures > 0:
        sys.exit(1)
# Fit a GBM on the airlines sample data.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Import the airlines sample used for modeling.
airlines_hex = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip")

# One uniform random number per row decides which split the row belongs to:
# [0, 0.6) -> training, [0.6, 0.9) -> validation, [0.9, 1) -> testing.
r = airlines_hex.runif()
train_rows = r < 0.6
valid_rows = (r >= 0.6) & (r < 0.9)
test_rows = r >= 0.9
air_train_hex = airlines_hex[train_rows]
air_valid_hex = airlines_hex[valid_rows]
air_test_hex = airlines_hex[test_rows]

# Predictors and GBM settings.
myX = ["DayofMonth", "DayOfWeek"]
gbm_settings = {
    "distribution": 'bernoulli',
    "ntrees": 100,
    "max_depth": 4,
    "learn_rate": 0.1,
}
air_model = H2OGradientBoostingEstimator(**gbm_settings)

# Only the training split is used for fitting here.
air_model.train(x=myX, y="IsDepDelayed", training_frame=air_train_hex)
def cars_checkpoint():
    """Check that a GBM continued from a checkpoint matches one built in a single run.

    A randomly chosen problem type (0: regression, 1: binomial, 2: multinomial) is
    modeled on the cars data.  Model 1 is trained, saved and reloaded.  Models 2 and 3
    continue from the reloaded model with more trees, and model 4 is trained in one
    shot with the same settings as model 2.  The validation metrics of model 2 and
    model 4 must be exactly equal.

    :raises AssertionError: if a validation metric differs between model 2 and model 4.
    """
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    for label, frame in (("training", train), ("validation", valid)):
        print("\n*** Description (chunk distribution, etc) of {0} frame:".format(label))
        frame.describe()

    # Pick the kind of model-building exercise: 0 regression, 1 binomial, 2 multinomial.
    problem = random.sample(list(range(3)), 1)[0]

    # Predictors, plus the response column and distribution that go with the problem type.
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    categorical_targets = {
        1: ("economy_20mpg", "bernoulli"),
        2: ("cylinders", "multinomial"),
    }
    response_col, distribution = categorical_targets.get(problem, ("economy", "gaussian"))
    if problem in categorical_targets:
        # Classification needs a categorical response in both frames.
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    def report_parameters(number, ntrees, max_depth, min_rows):
        # Print the tree parameters used for model `number`.
        print("*** ntrees model {0}: {1}".format(number, ntrees))
        print("*** max_depth model {0}: {1}".format(number, max_depth))
        print("*** min_rows model {0}: {1}".format(number, min_rows))

    def fit(ntrees, max_depth, min_rows, **extra):
        # Build and train a GBM; `extra` carries the optional checkpoint id.
        estimator = H2OGradientBoostingEstimator(ntrees=ntrees,
                                                 max_depth=max_depth,
                                                 min_rows=min_rows,
                                                 distribution=distribution,
                                                 score_each_iteration=True,
                                                 **extra)
        estimator.train(x=predictors,
                        y=response_col,
                        training_frame=train,
                        validation_frame=valid)
        return estimator

    # Build the first model.
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("\n*** Building model 1 with the following parameters:")
    report_parameters(1, ntrees1, max_depth1, min_rows1)
    model1 = fit(ntrees1, max_depth1, min_rows1)

    # Save the model, then load it back and remove the saved copy.
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # Continue building the model from the restored checkpoint.
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 2) with the following parameters:"
    )
    report_parameters(2, ntrees2, max_depth2, min_rows2)
    model2 = fit(ntrees2, max_depth2, min_rows2, checkpoint=restored_model._id)

    # Continue building the model again, but with a different number of trees.
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 3) with the following parameters:"
    )
    report_parameters(3, ntrees3, max_depth3, min_rows3)
    model3 = fit(ntrees3, max_depth3, min_rows3, checkpoint=restored_model._id)

    # Build the equivalent of model 2 in one shot (no checkpoint).
    print(
        "\n*** Building the equivalent of model 2 (called model 4) in one shot:"
    )
    model4 = fit(ntrees2, max_depth2, min_rows2)

    compared = ((2, model2), (3, model3), (4, model4))
    for number, model in compared:
        print("\n*** Model Summary for model {0}:".format(number))
        print(model.summary())
    for number, model in compared:
        print("\n*** Score History for model {0}:".format(number))
        print(model.scoring_history())

    # Checks: model 2 (checkpointed) must equal model 4 (one shot) on every metric that
    # applies to the problem type.  Model 3 is only reported; no inequality with model 4
    # is asserted for it.
    if problem == 0:
        metric_checks = [
            ("mse", "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}"),
        ]
    elif problem == 1:
        metric_checks = [
            ("auc", "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}"),
            ("logloss", "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}"),
            ("giniCoef", "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}"),
        ]
    else:
        metric_checks = [
            ("mse", "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}"),
            ("r2", "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}"),
        ]

    assert isinstance(model2, type(model4))
    for metric_name, message in metric_checks:
        checkpointed = getattr(model2, metric_name)(valid=True)
        one_shot = getattr(model4, metric_name)(valid=True)
        assert checkpointed == one_shot, message.format(checkpointed, one_shot)
def main_Calib(filename, output, mode, alg, basis, order, figure, verbose, offset, qt, pre, split):
    '''
    Fit calibration coefficients and store them, their errors and the AIC in an HDF5 file.

    NOTE(review): the nesting of this function was reconstructed from whitespace-collapsed text.
    Every spot where the block structure could not be derived from the statements themselves is
    marked with a NOTE(review) comment - confirm them against the author's intent.

    filename: input file, passed to pub.ReadFile (not read when pre == 'r')
    output:   path of the HDF5 file written with h5py
    mode:     'PE', 'time' or 'combined' - selects the dependent variable and the model
    alg:      'sms' (statsmodels), 'custom' (scipy minimize), 'sk' (scikit-learn) or 'h2o'
    basis:    'Legendre' or 'Zernike'
    order:    basis order; also the suffix of the dataset names in the output file
    figure:   'ON' to save a coefficient map to 'test.png'
    verbose:  truthy to print shapes, summaries and coefficients
    offset:   if truthy, passed to pub.LoadBase (the loaded value is not used afterwards)
    qt:       quantile for the statsmodels quantile regression (alg 'sms', mode 'time')
    pre:      falsy: fit directly; 'w': write the design matrix to 'test1.parquet' and return;
              'r': fit an h2o GLM on "electron-1.parquet"
    split:    fraction of the events / rows kept in 'combined' mode and when pre == 'w'
    '''
    if pre != 'r':
        print('begin reading file', flush=True)
        EventID, ChannelID, Q, PETime, photonTime, PulseTime, dETime, x, y, z = pub.ReadFile(filename)
        VertexTruth = (np.vstack((x, y, z))/1e3).T
        if(offset):
            off = pub.LoadBase(offset)
        else:
            off = np.zeros_like(PMTPos[:,0])
        print('total event: %d' % np.size(np.unique(EventID)), flush=True)
        print('begin processing legendre coeff', flush=True)
        # this part for the same vertex
        tmp = time.time()
        EventNo = np.size(np.unique(EventID))
        PMTNo = np.size(PMTPos[:,0])
        if mode == 'PE':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        elif mode == 'time':
            counts = np.bincount(EventID)
            counts = counts[counts!=0]
            PMTPosRep = PMTPos[ChannelID]
            vertex = np.repeat(VertexTruth, counts, axis=0)
        elif mode == 'combined':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)

        if basis == 'Legendre':
            X, cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=True)
        elif basis == 'Zernike':
            from zernike import RZern
            cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=False)
            cart = RZern(order)
            nk = cart.nk
            m = cart.mtab
            n = cart.ntab
            rho = np.linalg.norm(vertex, axis=1)/0.65
            theta = np.arccos(cos_theta)
            X = np.zeros((rho.shape[0], nk))
            for i in np.arange(nk):
                if not i % 5:
                    print(f'process {i}-th event')
                X[:,i] = cart.Zk(i, rho, theta)
            # keep only the terms with m >= 0
            X = X[:,m>=0]
            # NOTE(review): the rank print may have been meant for both bases.
            print(f'rank: {np.linalg.matrix_rank(X)}')
        print(f'use {time.time() - tmp} s')

        # which info should be used
        if mode == 'PE':
            y = Q
        elif mode == 'time':
            y = PulseTime
        elif mode == 'combined':
            # PulseTime = PulseTime - np.min(PulseTime)
            # PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            # print(np.min(PulseTime), np.max(PulseTime))
            PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            bins = np.arange(-1, 0.05, 0.1)
            N = 10
            # Legendre coeff
            x = pub.legval(bins, np.eye(N).reshape(N, N, 1))
            # 1st basis
            Y = np.tile(x, len(np.unique(EventID))*len(np.unique(ChannelID))).T
            # 2nd basis
            X = np.repeat(X, bins.shape[0], axis=0)
            # output
            y = np.zeros((len(np.unique(EventID)), len(np.unique(ChannelID)), len(bins)))
            # Disabled: explicit product basis of X and Y (was kept inside a string literal).
            # basis = np.zeros((X.shape[0], X.shape[1]*Y.shape[1]))
            # for i_index, i in enumerate(np.arange(X.shape[1])):
            #     for j_index, j in enumerate(np.arange(Y.shape[1])):
            #         total_index = i_index*Y.shape[1] + j_index
            #         if not total_index % 10:
            #             print(total_index)
            #         basis[:, total_index] = X[:,i_index]*Y[:,j_index]
            # X = basis
            split_index = np.unique(EventID).shape[0]
            for k_index, k in enumerate(np.unique(EventID)):
                # event begin with 1
                if k_index > split_index * split:
                    break
                if not k % 100:
                    print(k)
                index = EventID == k
                CID = ChannelID[index]
                Pulse_t = PulseTime[index]
                for i in np.unique(CID):
                    # PMT begin with 0
                    y[k_index, i, 1:], _ = np.histogram(Pulse_t[CID==i], bins=bins)
            y = np.reshape(y,(-1))

        # NOTE(review): this print and the pre == 'w' export are placed inside the pre != 'r'
        # branch because they need X and y, which only exist here - confirm the nesting.
        if verbose:
            print(f'the basis shape is {X.shape}, and the dependent variable shape is {y.shape}')

        if pre =='w':
            if split != 1:
                # builtin int: the np.int alias has been removed from NumPy
                split_index = int(split*y.shape[0])
                X = X[:split_index]
                Y = Y[:split_index]
                y = y[:split_index]
            import pandas as pd
            import pyarrow as pa
            import pyarrow.parquet as pq
            y = np.atleast_2d(y).T
            #data = np.hstack((X, y, np.ones_like(y)))
            df_X = pd.DataFrame(X)
            X_names = []
            for i in df_X.columns:
                X_names.append('X' + str(i))
            df_X.columns = X_names
            # NOTE(review): Y is only assigned in 'combined' mode - confirm that 'w' is used
            # with that mode only.
            df_Y = pd.DataFrame(Y)
            Y_names = []
            for i in df_Y.columns:
                Y_names.append('Y' + str(i))
            df_Y.columns = Y_names
            df_y = pd.DataFrame(y)
            df_y.columns = ['output']
            df = pd.concat([df_X, df_Y, df_y], axis=1)
            table = pa.Table.from_pandas(df)
            pq.write_table(table, 'test1.parquet')
            return

    if not pre:
        # Regression methods:
        if alg == 'sms':
            import statsmodels.api as sm
            if mode == 'PE':
                model = sm.GLM(y, X, family=sm.families.Poisson(), fit_intercept=False)
                result = model.fit()
                if verbose:
                    print(result.summary())
                AIC = result.aic
                coef_ = result.params
                std = result.bse
            elif mode == 'time':
                import pandas as pd
                data = pd.DataFrame(data = np.hstack((X, np.atleast_2d(y).T)))
                # Build the formula 'y ~ X1 + X2 + ...' and the column names X0, X1, ..., y.
                strs = 'y ~ '
                start = data.keys().start
                stop = data.keys().stop
                step = data.keys().step
                cname = []
                cname.append('X0')
                for i in np.arange(start+1, stop, step):
                    if i == start + 1:
                        strs += 'X%d ' % i
                    elif i == stop - step:
                        pass
                    else:
                        strs += ' + X%d ' % i
                    # the last column holds the dependent variable
                    if i == stop - step:
                        cname.append('y')
                    else:
                        cname.append('X%d' % i)
                data.columns = cname
                mod = sm.formula.quantreg(strs, data[cname])
                result = mod.fit(q=qt,)
                coef_ = result.params
                AIC = np.zeros_like(coef_)
                std = np.zeros_like(coef_)
                print('Waring! No AIC and std value')
            elif mode == 'combined':
                # data = pd.DataFrame(data = np.hstack((basis, np.atleast_2d(y).T)))
                # Dump the design matrix and the dependent variable before fitting.
                with h5py.File(output,'w') as out:
                    out.create_dataset('X', data = X)
                    out.create_dataset('Y', data = y)
                print('begin...')
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                coef_ = result.params
                std = result.bse
                AIC = result.aic
            # NOTE(review): this second summary print may belong to the 'combined' branch only.
            if verbose:
                print(result.summary())

        elif (alg == 'custom'):
            from scipy.optimize import minimize
            x0 = np.zeros_like(X[0])  # initial value (be careful of Zernike order)
            if mode == 'PE':
                x0[0] = 0.8 + np.log(2)  # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            elif mode == 'time':
                x0[0] = np.mean(y)
                # NOTE(review): this overrides the qt argument with a fixed quantile.
                qt = 0.1
                ts = 2.6
                result = minimize(pub.CalibTime, x0=x0, method='SLSQP', args = (np.hstack((EventID, EventID)), y, X, qt, ts))
            elif mode == 'combined':
                x0 = np.zeros_like(X[0])
                x0[0] = 0.8 + np.log(2)  # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            coef_ = np.array(result.x, dtype=float)
            if verbose:
                print(result.message)
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            H = pub.MyHessian(result.x, pub.CalibPE, *(y, PMTPos, X))
            # H = pub.MyHessian(result.x, *(Q, PMTPos, X, pub.CalibTime))
            # std = 1/np.sqrt(-np.diag(np.linalg.pinv(H1)))
            print(coef_)
            # print(std)
            print('Waring! No AIC and std value, std is testing')

        elif alg == 'sk':
            from sklearn.linear_model import TweedieRegressor
            alpha = 0.001
            reg = TweedieRegressor(power=1, alpha=alpha, link='log', max_iter=1000, tol=1e-6, fit_intercept=False)
            reg.fit(X, y)
            # just for point data
            # pred = reg.predict(X[0:30,0:cut+1])
            print('coeff:\n', reg.coef_,'\n')
            coef_ = reg.coef_
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            print('Waring! No AIC and std value')

        elif alg == 'h2o':
            import h2o
            from h2o.estimators.gbm import H2OGradientBoostingEstimator
            from h2o.estimators.glm import H2OGeneralizedLinearEstimator
            if mode != 'combined':
                # frame layout: basis columns, response, column of ones
                y = np.atleast_2d(y).T
                data = np.hstack((X, y, np.ones_like(y)))
                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]
            if mode == 'PE':
                #offset_col = hf.columns[-1]
                glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                                                          #offset_column = offset_col,
                                                          lambda_ = 0,
                                                          compute_p_values = True)
                glm_model.train(predictors, response_col, training_frame=hf)
                coef_table = glm_model._model_json['output']['coefficients_table']
                coef_ = glm_model.coef()
            elif mode == 'time':
                gbm = H2OGradientBoostingEstimator(distribution="quantile", seed = 1234,
                                                   stopping_metric = "mse", stopping_tolerance = 1e-4)
                gbm.train(x = predictors, y = response_col, training_frame = hf)
                # NOTE(review): debugger stop and hard exit - this branch never returns results.
                breakpoint()
                print(gbm)
                exit()
            elif mode == 'combined':
                # NOTE(review): only the frame is prepared here; no model is trained, so the
                # statements below that read coef_table / glm_model cannot succeed in this mode.
                y = np.atleast_2d(y).T
                data = np.hstack((X, Y, y, np.ones_like(y)))
                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]
            if verbose:
                print(coef_)
            # NOTE(review): this check may have been nested under `if verbose`.
            if basis == 'Zernike':
                print(f'Regession coef shape is f{np.array(coef_).shape}, Zernike shape is {nk}')
            coef_ = coef_table['coefficients']
            std = coef_table['std_error']
            AIC = glm_model.aic()
            h2o.cluster().shutdown()

    elif pre == 'r':
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        h2o.init()
        hf = h2o.import_file("electron-1.parquet")
        # interaction pairs between the 'Z...' and the 'L...' columns
        pairs = []
        for i in hf.columns:
            for j in hf.columns:
                if (i.startswith('Z') and j.startswith('L')):
                    if ((i!='X0') and (j != 'Y0')):
                        pairs.append((i,j))
        predictors = hf.columns[2:]
        response_col = hf.columns[0]
        print(predictors)
        print(response_col)
        print(pairs)
        if mode == 'PE':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                                                      #offset_column = offset_col,
                                                      lambda_ = 0,
                                                      compute_p_values = True)
            glm_model.train(predictors, response_col, training_frame=hf)
        elif mode == 'combined':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                                                      #offset_column = offset_col,
                                                      interaction_pairs=pairs,
                                                      lambda_ = 0,
                                                      #remove_collinear_columns = True,
                                                      compute_p_values = True)
            glm_model.train(predictors, response_col, training_frame=hf)
        # NOTE(review): debugger stop left in; it may belong to the 'combined' branch only.
        breakpoint()
        coef_table = glm_model._model_json['output']['coefficients_table']
        coef_ = coef_table['coefficients']
        std = coef_table['std_error']
        AIC = glm_model.aic()
        print(f'Regession coef is f{np.array(coef_)}')
        # NOTE(review): `cart` is only created in the Zernike branch of the pre != 'r' path, so
        # plotting from here fails with a NameError - confirm where this block belongs.
        if (figure=='ON'):
            import matplotlib.pyplot as plt
            L, K = 500, 500
            ddx = np.linspace(-1.0, 1.0, K)
            ddy = np.linspace(-1.0, 1.0, L)
            xv, yv = np.meshgrid(ddx, ddy)
            cart.make_cart_grid(xv, yv)
            # normal scale
            # im = plt.imshow(np.exp(cart.eval_grid(np.array(coef_), matrix=True)), origin='lower', extent=(-1, 1, -1, 1))
            # log scale
            im = plt.imshow(cart.eval_grid(np.array(coef_), matrix=True), origin='lower', extent=(-1, 1, -1, 1))
            plt.colorbar()
            plt.savefig('test.png')
    else:
        # NOTE(review): pairing of this `else` is a guess; the message suggests it once closed
        # the `alg` chain.
        print('error regression algorithm')

    with h5py.File(output,'w') as out:
        out.create_dataset('coeff' + str(order), data = coef_)
        out.create_dataset('std' + str(order), data = std)
        out.create_dataset('AIC' + str(order), data = AIC)
def multinomial_auc_prostate_gbm():
    """Compare the multinomial AUC values reported by an H2O GBM with scikit-learn.

    A one-tree GBM with ``auc_type="WEIGHTED_OVR"`` is trained on the prostate data with
    ``GLEASON`` as categorical response.  Its default AUC / PR AUC must equal the weighted
    one-vs-rest entries of the multinomial AUC tables, and the four table entries (ovr/ovo,
    macro/weighted) must agree with ``roc_auc_score`` within 1e-7.  A second model with
    ``auc_type="MACRO_OVR"`` is then trained and its values are printed.

    :raises AssertionError: if H2O's values disagree with each other or with scikit-learn.
    """
    def describe(label, left, right, separator=" "):
        # Build the "<label><left><separator><right>" text used by the prints and asserts.
        return label + str(left) + separator + str(right)

    data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response_col = "GLEASON"
    data[response_col] = data[response_col].asfactor()
    predictors = ["RACE", "AGE", "PSA", "DPROS", "CAPSULE", "VOL", "DCAPS"]
    distribution = "multinomial"

    # Train the model whose default AUC type is weighted one-vs-rest.
    gbm = H2OGradientBoostingEstimator(ntrees=1,
                                       max_depth=2,
                                       nfolds=3,
                                       distribution=distribution,
                                       auc_type="WEIGHTED_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data)
    gbm.show()

    # Results on the training data, as computed by h2o.
    cm = gbm.confusion_matrix(data)
    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)
    for printable in (cm, h2o_auc_table.as_data_frame(), h2o_aucpr_table.as_data_frame()):
        print(printable)

    # The aggregate values are read from fixed positions in index 3 of the tables.
    auc_values = h2o_auc_table[3]
    h2o_ovr_macro_auc = auc_values[7]
    h2o_ovr_weighted_auc = auc_values[8]
    h2o_ovo_macro_auc = auc_values[30]
    h2o_ovo_weighted_auc = auc_values[31]
    h2o_ovr_weighted_aucpr = h2o_aucpr_table[3][8]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    print(describe("default vs. table AUC ", h2o_ovr_weighted_auc, h2o_default_auc))
    print(describe("default vs. table PR AUC ", h2o_ovr_weighted_aucpr, h2o_default_aucpr))

    # The default should be the weighted one-vs-rest value.
    assert h2o_ovr_weighted_auc == h2o_default_auc, \
        describe("default vs. table AUC ", h2o_ovr_weighted_auc, h2o_default_auc, " != ")
    assert h2o_ovr_weighted_aucpr == h2o_default_aucpr, \
        describe("default vs. table PR AUC ", h2o_ovr_weighted_aucpr, h2o_default_aucpr, " != ")

    # Transform the data for sklearn: class probabilities and the true labels.
    prediction = gbm.predict(data).as_data_frame().iloc[:, 1:]
    actual = data[response_col].as_data_frame().iloc[:, 0].tolist()

    # Results on the training data from sklearn, paired with the matching h2o value.
    comparisons = []
    for multi_class, average, h2o_value in (("ovr", "macro", h2o_ovr_macro_auc),
                                            ("ovr", "weighted", h2o_ovr_weighted_auc),
                                            ("ovo", "macro", h2o_ovo_macro_auc),
                                            ("ovo", "weighted", h2o_ovo_weighted_auc)):
        sklearn_value = roc_auc_score(actual, prediction, multi_class=multi_class, average=average)
        label = "sklearn vs. h2o " + multi_class + " " + average + ": "
        comparisons.append((label, sklearn_value, h2o_value))

    for label, sklearn_value, h2o_value in comparisons:
        print(describe(label, sklearn_value, h2o_value))

    # Compare results h2o vs sklearn.
    precision = 1e-7
    for label, sklearn_value, h2o_value in comparisons:
        assert abs(h2o_value - sklearn_value) < precision, \
            describe(label, sklearn_value, h2o_value, " != ")

    # Train again with an explicit macro one-vs-rest auc_type and report the values.
    gbm = H2OGradientBoostingEstimator(ntrees=1,
                                       max_depth=2,
                                       nfolds=3,
                                       distribution=distribution,
                                       auc_type="MACRO_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)

    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)
    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_macro_aucpr = h2o_aucpr_table[3][7]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    print(describe("default vs. table AUC ", h2o_ovr_macro_auc, h2o_default_auc))
    print(describe("default vs. table PR AUC ", h2o_ovr_macro_aucpr, h2o_default_aucpr))
def check_same(data1, data2, min_rows_scale):
    """Train unweighted and weighted GBMs and assert that their metrics agree.

    Three model pairs are built (regression on ``economy``, binomial on
    ``economy_20mpg``, multinomial on ``cylinders``).  The first model of each
    pair is trained on ``data1`` without weights; the second is trained on
    ``data2`` with ``weights_column="weights"`` and ``min_rows`` multiplied by
    ``min_rows_scale``.

    :param data1: training frame for the unweighted models.
    :param data2: training frame for the weighted models; must contain a
        ``weights`` column.
    :param min_rows_scale: factor applied to ``min_rows`` (5) for the weighted
        models.
    :raises AssertionError: if a pair's metrics differ by more than the
        relative tolerance used for that pair.
    """
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    # The weighted models list "weights" among the predictors *and* pass it as
    # ``weights_column``, exactly as the original calls did.
    weighted_predictors = predictors + ["weights"]
    scaled_min_rows = 5 * min_rows_scale

    gbm1_regression = H2OGradientBoostingEstimator(min_rows=5, ntrees=5, max_depth=5)
    gbm1_regression.train(x=predictors, y="economy", training_frame=data1)
    gbm2_regression = H2OGradientBoostingEstimator(min_rows=scaled_min_rows, ntrees=5, max_depth=5)
    gbm2_regression.train(x=weighted_predictors, y="economy", training_frame=data2,
                          weights_column="weights")

    gbm1_binomial = H2OGradientBoostingEstimator(min_rows=5, distribution="bernoulli",
                                                 ntrees=5, max_depth=5)
    gbm1_binomial.train(x=predictors, y="economy_20mpg", training_frame=data1)
    gbm2_binomial = H2OGradientBoostingEstimator(min_rows=scaled_min_rows, distribution="bernoulli",
                                                 ntrees=5, max_depth=5)
    gbm2_binomial.train(x=weighted_predictors, y="economy_20mpg", training_frame=data2,
                        weights_column="weights")

    gbm1_multinomial = H2OGradientBoostingEstimator(min_rows=5, distribution="multinomial",
                                                    ntrees=5, max_depth=5)
    gbm1_multinomial.train(x=predictors, y="cylinders", training_frame=data1)
    gbm2_multinomial = H2OGradientBoostingEstimator(min_rows=scaled_min_rows, distribution="multinomial",
                                                    ntrees=5, max_depth=5)
    gbm2_multinomial.train(x=weighted_predictors, y="cylinders", weights_column="weights",
                           training_frame=data2)

    reg1_mse = gbm1_regression.mse()
    reg2_mse = gbm2_regression.mse()
    bin1_auc = gbm1_binomial.auc()
    bin2_auc = gbm2_binomial.auc()
    mul1_mse = gbm1_multinomial.mse()
    mul2_mse = gbm2_multinomial.mse()

    print("MSE (regresson) no weights vs. weights: {0}, {1}".format(reg1_mse, reg2_mse))
    print("AUC (binomial) no weights vs. weights: {0}, {1}".format(bin1_auc, bin2_auc))
    print("MSE (multinomial) no weights vs. weights: {0}, {1}".format(mul1_mse, mul2_mse))

    assert abs(reg1_mse - reg2_mse) < 1e-5 * reg1_mse, \
        "Expected mse's to be the same, but got {0}, and {1}".format(reg1_mse, reg2_mse)
    assert abs(bin1_auc - bin2_auc) < 3e-4 * bin1_auc, \
        "Expected auc's to be the same, but got {0}, and {1}".format(bin1_auc, bin2_auc)
    # This used to compare mul1_mse with itself, so it could never fail.
    # NOTE(review): the 1e-6 tolerance was never actually exercised; loosen it
    # if the now-effective check turns out to be too strict.
    assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, \
        "Expected mse's to be the same, but got {0}, and {1}".format(mul1_mse, mul2_mse)
# Variable importances of the previously trained random forest, as a DataFrame.
rf_v1._model_json['output']['variable_importances'].as_data_frame()

# In[27]:

# Validation-set performance of the random forest.
perf = rf_v1.model_performance(valid=True)
perf.plot()

# In[69]:

rf_v1.r2()

# ### Gradient Boost

# In[61]:

gbm1 = H2OGradientBoostingEstimator()
gbm1.train(train_X, train_y, training_frame=train, validation_frame=val)

# In[62]:

# DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` yields the same
# ndarray and is already used on the ``get_dummies`` result below.
# The last two prediction columns are flattened row by row.
# NOTE(review): presumably these are the per-class probabilities - confirm.
pred = gbm1.predict(val[:, 1:-1]).as_data_frame().values[:, -2:].ravel()
# One-hot encode the last column of ``val`` and flatten it the same way so it
# lines up element-wise with ``pred``.
true = pd.get_dummies(
    val[:, -1].as_data_frame().values.flatten()).values.ravel()
print("AUC Score calculaed by sklearn")
roc_auc_score(true, pred)

# In[63]:

gbm1.confusion_matrix(valid=True)

# In[41]:
creditcard_df = h2o.import_file(os.path.realpath("input/creditcard.csv"))

# 60% for training
# 20% for validation (hyper parameter tuning)
# 20% for final testing
# Split the data as described above; the remainder after 0.6 and 0.2 becomes
# the third (test) frame.
train, valid, test = creditcard_df.split_frame([0.6, 0.2], seed=1234)

# Prepare predictors and response columns
creditcard_X = creditcard_df.col_names[:-1]  # last column is Class, our desired response variable
creditcard_y = creditcard_df.col_names[-1]

gbm_v1 = H2OGradientBoostingEstimator(model_id="gbm_creditcard_v1",
                                      max_hit_ratio_k=3,
                                      seed=2000000)
gbm_v1.train(creditcard_X, creditcard_y,
             training_frame=train, validation_frame=valid)

gbm_v1.score_history()

# The previous call passed the ``valid`` frame positionally together with
# ``train=FALSE, valid=FALSE, xval=FALSE``; ``FALSE`` is not a Python name and
# the method takes boolean flags, not frames.
# NOTE(review): validation metrics are assumed to be the intent - confirm.
gbm_v1.hit_ratio_table(valid=True)

# This default GBM is much worse than our original random forest.
#
#
# The GBM is far from converging, so there are three primary knobs to adjust
# to get our performance up if we want to keep a similar run time.
#
#gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters) #gs.train(x=range(0, iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10) ## ## Pipeline ## from h2o.transforms.preprocessing import H2OScaler from h2o.transforms.decomposition import H2OPCA from sklearn.pipeline import Pipeline h2o.no_progress() pipeline = Pipeline([ ("standardize", H2OScaler()), ("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="multinomial")) ]) print pipeline.fit(iris_df[:4], iris_df[4]) ## ## Randomized Gird Search ## from sklearn.grid_search import RandomizedSearchCV from h2o.cross_validation import H2OKFold from h2o.model.regression import h2o_r2_score from sklearn.metrics.scorer import make_scorer params = { "standardize__center": [True, False], "standardize__scale": [True, False],
("uploader", H2OFrameCreator()), ("classifier", classifier)]) pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types=["categorical"])) pipeline.verify(audit_X.sample(frac=0.05, random_state=13)) classifier = pipeline._final_estimator store_mojo(classifier, name) store_pkl(pipeline, name) adjusted = pipeline.predict(audit_X) adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"]) store_csv(adjusted.as_data_frame(), name) if "Audit" in datasets and with_h2o: build_audit_h2o( H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=17), "H2OGradientBoostingAudit") build_audit_h2o(H2OGeneralizedLinearEstimator(family="binomial"), "H2OLogisticRegressionAudit") build_audit_h2o( H2ORandomForestEstimator(distribution="bernoulli", seed=13), "H2ORandomForestAudit") audit_dict_X = audit_X.to_dict("records") def build_audit_dict(classifier, name, with_proba=True): pipeline = PMMLPipeline([("dict-transformer", DictVectorizer()), ("classifier", classifier)]) pipeline.fit(audit_dict_X, audit_y) store_pkl(pipeline, name)
def stackedensemble_validation_frame_test():
    """Check how a stacked ensemble reacts to a ``validation_frame``.

    1) Without a validation frame, ``model_performance(valid=True)`` is None.
    2) With one, validation metrics exist and have the expected types.
    3) On the held-out test frame, the ensemble trained with a validation
       frame must reach an AUC at least as high as the one trained without.
    """
    response = "response"

    # Load the training and test frames
    higgs = h2o.import_file(path=pyunit_utils.locate("smalldata/higgs/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    holdout = h2o.import_file(path=pyunit_utils.locate("smalldata/higgs/higgs_test_5k.csv"),
                              destination_frame="higgs_test_5k")

    # Every column except the response is a predictor
    predictors = higgs.columns
    predictors.remove(response)

    # The response is converted to a factor in both frames
    higgs[response] = higgs[response].asfactor()
    holdout[response] = holdout[response].asfactor()

    # Carve a validation frame out of the training data
    train, valid = higgs.split_frame(seed=1)

    # Cross-validation settings shared by both base learners
    nfolds = 5
    cv_settings = dict(ntrees=10,
                       nfolds=nfolds,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True,
                       seed=1)

    # Base learner 1: cross-validated GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", **cv_settings)
    my_gbm.train(x=predictors, y=response, training_frame=train)

    # Base learner 2: cross-validated random forest
    my_rf = H2ORandomForestEstimator(**cv_settings)
    my_rf.train(x=predictors, y=response, training_frame=train)

    base_ids = [my_gbm.model_id, my_rf.model_id]

    # Ensemble without a validation frame: no validation metrics expected
    plain_stack = H2OStackedEnsembleEstimator(base_models=list(base_ids))
    plain_stack.train(x=predictors, y=response, training_frame=train)
    assert plain_stack.model_performance(valid=True) is None

    # Ensemble with a validation frame: metrics must exist with the right types
    validated_stack = H2OStackedEnsembleEstimator(base_models=list(base_ids))
    validated_stack.train(x=predictors, y=response, training_frame=train,
                          validation_frame=valid)
    valid_metrics = validated_stack.model_performance(valid=True)
    assert type(valid_metrics) == h2o.model.metrics_base.H2OBinomialModelMetrics
    valid_auc = validated_stack.auc(valid=True)
    assert type(valid_auc) == float

    # Test AUC: the ensemble trained with a validation frame must not be worse
    plain_perf = plain_stack.model_performance(test_data=holdout)
    validated_perf = validated_stack.model_performance(test_data=holdout)
    assert validated_perf.auc() >= plain_perf.auc()
#Here we retrain base models before calling stack ensemble print("stacked") for b_model in base: if 'GLM' in b_model: #GLM is giving error with re-training base.remove(b_model) else: m = h2o.get_model(b_model) m.train(y=-1, training_frame=d) ensemble = H2OStackedEnsembleEstimator(base_models=base) ensemble.train(y=-1, training_frame=d) anytime_model = ensemble else: aml.leader.train(y=-1, training_frame=d) anytime_model = aml.leader # In[27]: from h2o.estimators.gbm import H2OGradientBoostingEstimator m = h2o.get_model('GBM_grid__1_AutoML_20200518_140119_model_4') print(m) # In[28]: m_new = H2OGradientBoostingEstimator(checkpoint=m_new) m_new.train(y=-1, training_frame=d) print(m_new)
def test_h2o_classifier_multi_2class(self):
    """Converting this multinomial GBM MOJO must raise a "not supported" ValueError.

    The model is trained through ``_train_classifier(gbm, 2, is_str=True)``.
    NOTE(review): the ``2`` is presumably the class count, going by the test
    name - confirm against ``_train_classifier``.
    """
    gbm = H2OGradientBoostingEstimator(ntrees=7, max_depth=5, distribution="multinomial")
    # Only the MOJO path is needed; the returned test data is unused here.
    mojo_path, _ = _train_classifier(gbm, 2, is_str=True)
    with self.assertRaises(ValueError) as err:
        _convert_mojo(mojo_path)
    # assertRegexpMatches is a deprecated alias that was removed in
    # Python 3.12; assertRegex is the supported spelling.
    self.assertRegex(err.exception.args[0], "not supported")
def mojo_predict_api_test(sandbox_dir):
    """Exercise ``h2o.mojo_predict_csv`` with default and custom paths.

    A bernoulli GBM is trained on the prostate data and its MOJO is
    downloaded.  The test then checks prediction with:
      * the genmodel jar at its default location in ``sandbox_dir``,
      * a custom genmodel jar location (the default lookup is expected to
        raise ``RuntimeError``),
      * a custom output CSV path.

    :param sandbox_dir: directory receiving the input CSV, the MOJO zip, the
        default genmodel jar and the default prediction CSV.
    """
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    # The scoring input is exported before the response is turned into a factor.
    h2o.export_file(prostate[1, 2:], input_csv)

    prostate[1] = prostate[1].asfactor()
    gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
    gbm.train(x=list(range(2, 9)), y=1, training_frame=prostate)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(gbm, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    def run_prediction(**extra):
        # Every call shares the same input CSV, MOJO zip and verbosity.
        h2o.mojo_predict_csv(input_csv_path=input_csv,
                             mojo_zip_path=model_zip_path,
                             verbose=True,
                             **extra)

    # test that we can predict using default paths
    run_prediction()
    run_prediction(genmodel_jar_path=genmodel_path)
    assert os.path.isfile(output_csv)
    for leftover in (model_zip_path, genmodel_path, output_csv):
        os.remove(leftover)

    # test that we can predict using custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(gbm, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)

        # With the jar stored elsewhere, the default lookup must fail.
        try:
            run_prediction()
        except RuntimeError:
            pass
        else:
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        assert not os.path.isfile(output_csv)

        run_prediction(genmodel_jar_path=genmodel_path)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        # test that we can predict into a custom output path
        output_csv = "%s/out.prediction" % other_sandbox_dir
        run_prediction(genmodel_jar_path=genmodel_path, output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        for leftover in (model_zip_path, genmodel_path, output_csv):
            os.remove(leftover)
    finally:
        shutil.rmtree(other_sandbox_dir)
def partial_plot_test_with_user_splits():
    """Check that partial-dependence tables agree across call variants.

    A GBM predicting ``CAPSULE`` is trained on the prostate data, then
    ``partial_plot`` is called three times with the same ``user_splits``:
    2D pairs only, 1D columns plus 2D pairs, and 1D columns only.  The 1D
    tables of the combined call must match the 1D-only call and its 2D tables
    must match the 2D-only call, up to a tolerance of 1e-10.
    """
    data = h2o.import_file(
        pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = {
        'AGE': [
            43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579,
            50.578947368421055, 52.473684210526315, 54.368421052631575,
            56.26315789473684, 58.1578947368421, 60.05263157894737,
            61.94736842105263, 63.84210526315789, 65.73684210526315,
            67.63157894736842, 69.52631578947368, 71.42105263157895,
            73.3157894736842, 75.21052631578948, 77.10526315789474,
        ],
        'RACE': ["Black", "White"],
    }
    pairs_2d = [['AGE', 'PSA'], ['AGE', 'RACE']]
    cols_1d = ['AGE', 'RACE', 'DCAPS']

    # mkstemp returns an open descriptor; only the path is used, so close it
    # straight away instead of leaking it.
    fd, filename = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pdpUserSplit2D = gbm_model.partial_plot(data=data, server=True, plot=True,
                                                user_splits=user_splits,
                                                col_pairs_2dpdp=pairs_2d,
                                                save_to_file=filename)
        pdpUserSplit1D2D = gbm_model.partial_plot(data=data, cols=cols_1d,
                                                  server=True, plot=True,
                                                  user_splits=user_splits,
                                                  col_pairs_2dpdp=pairs_2d,
                                                  save_to_file=filename)
        pdpUserSplit1D = gbm_model.partial_plot(data=data, cols=cols_1d,
                                                server=True, plot=True,
                                                user_splits=user_splits,
                                                save_to_file=filename)
    finally:
        # Remove the image even when one of the plotting calls raises.
        if os.path.isfile(filename):
            os.remove(filename)

    # compare results 1D pdp
    for i in range(3):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            pdpUserSplit1D[i], pdpUserSplit1D2D[i],
            pdpUserSplit1D[i].col_header, tolerance=1e-10)

    # compare results 2D pdp: they follow the three 1D tables in the combined call
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(
        pdpUserSplit2D[0], pdpUserSplit1D2D[3],
        pdpUserSplit2D[0].col_header, tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(
        pdpUserSplit2D[1], pdpUserSplit1D2D[4],
        pdpUserSplit2D[1].col_header, tolerance=1e-10)