def run_print_model_performance(family, train, nfolds, bc_constraints, x, y, printText, seed, solver):
    """Train two cross-validated GLMs and check their beta constraints.

    One model is built without lambda search and one with it.  The
    cross-validation performance of each model is printed.  When
    ``bc_constraints`` is given, every constrained coefficient of both models
    must lie inside its [lower, upper] interval or be exactly zero.

    :param family: GLM family passed to the estimator.
    :param train: training frame.
    :param nfolds: number of cross-validation folds.
    :param bc_constraints: beta-constraint frame with ``names``,
        ``lower_bounds`` and ``upper_bounds`` columns, or None.
    :param x: predictor columns.
    :param y: response column.
    :param printText: banner printed before anything else.
    :param seed: seed passed to both estimators.
    :param solver: GLM solver name.
    :raises AssertionError: if a constrained coefficient is out of bounds.
    """
    print(printText)
    # The progress messages carry this prefix only when no constraints are given.
    prefix = "No beta constraints: " if bc_constraints is None else ""

    print("{0}Without lambda search and with solver {1}".format(prefix, solver))
    h2o_model = H2OGeneralizedLinearEstimator(family=family, nfolds=nfolds,
                                              beta_constraints=bc_constraints,
                                              seed=seed, solver=solver)
    h2o_model.train(x=x, y=y, training_frame=train)
    print(h2o_model.model_performance(xval=True))

    print("{0}With lambda search and with solver {1}".format(prefix, solver))
    h2o_model2 = H2OGeneralizedLinearEstimator(family=family, nfolds=nfolds,
                                               beta_constraints=bc_constraints,
                                               seed=seed, lambda_search=True,
                                               solver=solver)
    h2o_model2.train(x=x, y=y, training_frame=train)
    # Report the lambda-search model here; the first model was printed above.
    print(h2o_model2.model_performance(xval=True))

    if bc_constraints is None:
        return  # nothing to verify without constraints

    coeff = h2o_model.coef()
    coeff2 = h2o_model2.coef()
    colNames = bc_constraints["names"]
    lowerB = bc_constraints["lower_bounds"]
    upperB = bc_constraints["upper_bounds"]
    for count in range(len(colNames)):
        name = colNames[count, 0]
        lower = lowerB[count, 0]
        upper = upperB[count, 0]
        for coefficients in (coeff, coeff2):
            value = coefficients[name]
            assert (value >= lower and value <= upper) or value == 0, \
                "coefficient exceed limits"
def algo_pr_auc_test():
    """
    This pyunit test is written to make sure we can call pr_auc() on all binomial models.

    GBM, GLM, random forest and deep learning classifiers are trained on the
    prostate training set and their pr_auc() values are printed and loosely
    compared.  Finally a gaussian GLM is trained and pr_auc() is expected to
    raise for it.
    """
    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()
    predictors = list(range(1, prostate_train.ncol))

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1, max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=predictors, y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GBM model")
    print(gbm_h2o)
    print("pr_auc for GBM model is {0}".format(gbm_h2o.pr_auc()))

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=predictors, y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc
    print("pr_auc for GLM model is {0}".format(glm_h2o.pr_auc()))

    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=predictors, y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing random forest model")
    print(rf_h2o)
    print("pr_auc for Random Forest model is {0}".format(rf_h2o.pr_auc()))

    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2, 2])
    dl_h2o.train(x=predictors, y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing deeplearning model")
    print(dl_h2o)
    print("pr_auc for deeplearning model is {0}".format(dl_h2o.pr_auc()))

    assert abs(gbm_h2o.pr_auc() - glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"
    assert abs(rf_h2o.pr_auc() - dl_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"
    assert abs(rf_h2o.pr_auc() - glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    # try to call pr_auc() for regression.  Should encounter error.
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity", alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    try:
        print(h2o_model.pr_auc())
    except Exception:
        pass  # expected: pr_auc() is not available for a regression model
    else:
        # Raised outside the try body so the failure cannot be swallowed by
        # the handler that accepts the expected error.
        raise AssertionError("pr_auc() should raise an error for regression models but did not.")
def test_GLM_RCC_warning():
    """Check the warnings emitted when remove_collinear_columns is misused.

    A tweedie GLM with ``remove_collinear_columns=True`` is trained twice:
    once with ``lambda_search=True`` and once with a non-zero ``Lambda``.
    Everything written to stderr during each training is captured and must
    contain the matching warning phrase.
    """
    warnNumber = 1
    hdf = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with lambda_search")
    buffer = _train_tweedie_capturing_stderr(hdf, x, y, lambda_search=True)
    _assert_warning_captured(
        buffer, "It is used improperly here with lambda_search", warnNumber)

    print("Create models with non-zero lambda")
    buffer = _train_tweedie_capturing_stderr(hdf, x, y, Lambda=0.01)
    _assert_warning_captured(
        buffer, "It is used improperly here. Please set lambda=0", warnNumber)


def _train_tweedie_capturing_stderr(frame, x, y, **glm_params):
    """Train a tweedie/irlsm GLM with stderr redirected; return the capture buffer."""
    buffer = StringIO()
    sys.stderr = buffer  # redirect output
    try:
        model = H2OGeneralizedLinearEstimator(
            family="tweedie", link="tweedie", remove_collinear_columns=True,
            solver="irlsm", **glm_params)
        model.train(x=x, y=y, training_frame=frame)  # should generate a warning
    finally:
        # Always undo the redirection, even when training raises, so later
        # output is not silently swallowed by the buffer.
        sys.stderr = sys.__stderr__
    return buffer


def _assert_warning_captured(buffer, warn_phrase, warn_number):
    """Assert that ``warn_phrase`` appears in the captured stderr ``buffer``."""
    try:  # for python 2.7, where the buffer exposes ``buflist``
        assert len(buffer.buflist) == warn_number
        print(buffer.buflist[0])
        assert warn_phrase in buffer.buflist[0]
    except (AttributeError, AssertionError, IndexError):  # for python 3.
        warns = buffer.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert warn_phrase in warns
def run_print_model_performance(family, train, nfolds, bc_constraints, x, y, printText, seed, solver):
    """Fit GLMs with and without lambda search and verify beta constraints.

    Both models are cross-validated with ``nfolds`` folds and their
    cross-validation performance is printed.  If ``bc_constraints`` is not
    None, each constrained coefficient of both models must be zero, or be at
    least its lower bound and below (or within 1e-6 above) its upper bound.
    """
    print(printText)
    constrained = bc_constraints is not None
    # Only forward beta_constraints when the caller actually supplied them.
    extra_args = {"beta_constraints": bc_constraints} if constrained else {}

    print("Without lambda search, solver = {0}".format(solver))
    plain_glm = H2OGeneralizedLinearEstimator(family=family, nfolds=nfolds, seed=seed,
                                              solver=solver, **extra_args)
    plain_glm.train(x=x, y=y, training_frame=train)
    print(plain_glm.model_performance(xval=True))

    print("With lambda search, solver = {0}".format(solver))
    search_glm = H2OGeneralizedLinearEstimator(family=family, nfolds=nfolds, seed=seed,
                                               lambda_search=True, solver=solver,
                                               **extra_args)
    search_glm.train(x=x, y=y, training_frame=train)
    print(search_glm.model_performance(xval=True))

    if not constrained:
        return

    fitted = (plain_glm.coef(), search_glm.coef())
    names = bc_constraints["names"]
    lows = bc_constraints["lower_bounds"]
    highs = bc_constraints["upper_bounds"]
    for row in range(0, len(names)):
        for coefs in fitted:
            value = coefs[names[row, 0]]
            low = lows[row, 0]
            high = highs[row, 0]
            # Allow a tiny overshoot above the upper bound.
            under_upper = value < high or (value - high) < 1e-6
            assert (value >= low and under_upper) or value == 0, \
                "coeff: {0}, lower limit: {1}, upper limit: {2}".format(value, low, high)
def link_functions_poisson():
    """Compare h2o and statsmodels poisson GLMs for the log and identity links.

    For each link the ratio residual deviance / null deviance of the h2o model
    must not exceed the statsmodels ratio by 0.01 or more.
    """
    print("Read in prostate data.")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=data_path)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # old and new pandas alike.  The archive is closed once the CSV is read.
    with zipfile.ZipFile(data_path) as archive:
        sm_data = pd.read_csv(archive.open("prostate_complete.csv")).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: POISSON")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create h2o model with canonical link: LOG")
    h2o_model_log = H2OGeneralizedLinearEstimator(family="poisson", link="log", alpha=0.5, Lambda=0)
    h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)

    print("Create statsmodel model with canonical link: LOG")
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Poisson(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(), h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance, sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, \
        "expected h2o to have an equivalent or better deviance measures"

    print("Create h2o models with link: IDENTITY")
    h2o_model_id = H2OGeneralizedLinearEstimator(family="poisson", link="identity", alpha=0.5, Lambda=0)
    h2o_model_id.train(x=myX, y=myY, training_frame=h2o_data)

    print("Create statsmodel models with link: IDENTITY")
    sm_model_id = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Poisson(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance_id = old_div(h2o_model_id.residual_deviance(), h2o_model_id.null_deviance())
    sm_deviance_id = old_div(sm_model_id.deviance, sm_model_id.null_deviance)
    assert h2o_deviance_id - sm_deviance_id < 0.01, \
        "expected h2o to have an equivalent or better deviance measures"
def interactions_GLM_Binomial():
    """Check binomial GLM interaction handling when the data contains NAs.

    First a model with several interaction pairs is trained on a frame with
    missing values and on its NA-free counterpart, and their coefficient
    tables are compared with assert_arrays_equal_NA.  Then num-by-num,
    enum-by-enum and enum-by-num interactions are each checked through
    performOneTest.
    """
    # multiple interaction pairs (enum by enum, enum by num and num by num), all with NA terms
    print("******* Test interaction pairs")
    all_columns = ['label', 'numerical_feat', 'numerical_feat2', 'categorical_feat',
                   'categorical_feat2']
    raw_with_na = np.array([[1, 0, 1, 0, 1, 0],
                            [1, 2, 4.2/2.2, 4, 3, 1],
                            [2, 3, float('NaN'), 1, 2, 3],
                            ["a", "a", "a", "b", "a", "b"],
                            ['Foo', 'UNKNOWN', 'Foo', 'Foo', 'Foo', 'Bar']])
    pandas_with_na = pd.DataFrame(raw_with_na.T, columns=all_columns)
    frame_with_na = h2o.H2OFrame(pandas_with_na, na_strings=["UNKNOWN"])

    raw_complete = np.array([[1, 0, 1, 0, 1, 0],
                             [1, 2, 4.2/2.2, 4, 3, 1],
                             [2, 3, 2.2, 1, 2, 3],
                             ["a", "a", "a", "b", "a", "b"],
                             ['Foo', 'Foo', 'Foo', 'Foo', 'Foo', 'Bar']])
    pandas_complete = pd.DataFrame(raw_complete.T, columns=all_columns)
    frame_complete = h2o.H2OFrame(pandas_complete, na_strings=["UNKNOWN"])

    pairs = [("numerical_feat", "numerical_feat2"),
             ("numerical_feat", "categorical_feat2"),
             ("categorical_feat", "categorical_feat2")]
    predictors = ['numerical_feat', 'numerical_feat2', 'categorical_feat', 'categorical_feat2']

    # model trained on the frame that contains NAs
    glm_with_na = H2OGeneralizedLinearEstimator(family="Binomial", alpha=0, lambda_search=False,
                                                interaction_pairs=pairs, standardize=False)
    glm_with_na.train(x=predictors, y='label', training_frame=frame_with_na)
    # model trained on the frame without NAs
    glm_complete = H2OGeneralizedLinearEstimator(family="Binomial", alpha=0, lambda_search=False,
                                                 interaction_pairs=pairs, standardize=False)
    glm_complete.train(x=predictors, y='label', training_frame=frame_complete)
    table_with_na = glm_with_na._model_json['output']['coefficients_table'].cell_values
    table_complete = glm_complete._model_json['output']['coefficients_table'].cell_values
    assert_arrays_equal_NA(table_with_na, table_complete)

    # interaction of num and num columns
    print("******* Test interaction with num by num")
    num_num_columns = ['label', 'numerical_feat', 'numerical_feat2']
    num_num_na = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 2, 4], [2, 3, float('NaN'), 1]]).T,
        columns=num_num_columns)
    num_num = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 2, 4], [2, 3, 2, 1]]).T,
        columns=num_num_columns)
    performOneTest(num_num_na, num_num,
                   interactionColumn=['numerical_feat', 'numerical_feat2'],
                   xcols=['numerical_feat', 'numerical_feat2'], standard=False)

    # interaction of enum and enum columns
    print("******* Test interaction with enum by enum")
    cat_cat_columns = ['label', 'categorical_feat', 'categorical_feat2']
    cat_cat_na = pd.DataFrame(
        np.array([[1, 0, 1, 0], ["a", "a", "b", "b"], ['Foo', 'UNKNOWN', 'Foo', 'Bar']]).T,
        columns=cat_cat_columns)
    cat_cat = pd.DataFrame(
        np.array([[1, 0, 1, 0], ["a", "a", "b", "b"], ['Foo', 'Foo', 'Foo', 'Bar']]).T,
        columns=cat_cat_columns)
    performOneTest(cat_cat_na, cat_cat,
                   interactionColumn=['categorical_feat', 'categorical_feat2'],
                   xcols=['categorical_feat', 'categorical_feat2'])

    # interaction of enum and num columns
    print("******* Test interaction with enum by num")
    cat_num_columns = ['label', 'numerical_feat', 'categorical_feat']
    cat_num_na = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 3, 4], ['Foo', 'UNKNOWN', 'Foo', 'Bar']]).T,
        columns=cat_num_columns)
    cat_num = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 3, 4], ['Foo', 'Foo', 'Foo', 'Bar']]).T,
        columns=cat_num_columns)
    performOneTest(cat_num_na, cat_num,
                   interactionColumn=['numerical_feat', 'categorical_feat'],
                   xcols=['numerical_feat', 'categorical_feat'])
def link_functions_negbinomial():
    """Compare h2o and statsmodels negative binomial GLMs over several thetas.

    For every theta a log-link pair and an identity-link pair of models
    (one h2o, one statsmodels) is built and handed to compareModels.
    """
    print("Read in prostate data.")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=data_path)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # old and new pandas alike.  The archive is closed once the CSV is read.
    with zipfile.ZipFile(data_path) as archive:
        sm_data = pd.read_csv(archive.open("prostate_complete.csv")).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    thetas = [0.000000001, 0.01, 0.1, 0.5, 1]

    for thetaO in thetas:
        # Each statsmodels model uses the same link as the h2o model it is
        # compared with, so the comparison is like for like.
        print("Create statsmodel model with canonical link: LOG")
        sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                              family=sm.families.NegativeBinomial(
                                  sm.families.links.log, thetaO)).fit()
        print("Create h2o model with canonical link: LOG")
        h2o_model_log = H2OGeneralizedLinearEstimator(
            family="negativebinomial", link="log", alpha=0.5, Lambda=0, theta=thetaO)
        h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)
        print("Comparing H2O model and Python model with log link and theta={0}".format(thetaO))
        compareModels(h2o_model_log, sm_model_log)

        print("Create statsmodel model with canonical link: identity")
        sm_model_identity = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                                   family=sm.families.NegativeBinomial(
                                       sm.families.links.identity, thetaO)).fit()
        print("Create h2o model with canonical link: identity")
        h2o_model_identity = H2OGeneralizedLinearEstimator(
            family="negativebinomial", link="identity", alpha=0.5, Lambda=0, theta=thetaO)
        h2o_model_identity.train(x=myX, y=myY, training_frame=h2o_data)
        # The placeholder was missing, so theta was never actually printed.
        print("Comparing H2O model and Python model with identity link and theta = {0}"
              .format(thetaO))
        compareModels(h2o_model_identity, sm_model_identity)
def test_relevel():
    """Check that relevel() changes which DPROS level GLM uses as reference.

    A binomial GLM is trained before and after releveling ``DPROS`` to
    "None".  The coefficient names are checked in both cases, and the
    coefficients of the releveled model are compared against values from R.
    """
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0,
                                          missing_values_handling="Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    # NOTE: ``assert (cond, "msg")`` asserts a non-empty tuple and always
    # passes; the condition and the message must be separate operands.
    assert "DPROS.None" in ns, "None level IS NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns, "Both level IS expected to be skipped by default"

    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0,
                                          missing_values_handling="Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    coefs2 = mh2o2.coef()
    ns2 = coefs2.keys()
    print(ns2)
    # After releveling, "None" is the reference level, so it is the one left
    # out; this agrees with exp_coefs below, which lists DPROS.Both and has no
    # DPROS.None entry.
    assert "DPROS.None" not in ns2, "None level IS expected to be skipped after relevel"
    assert "DPROS.Both" in ns2, "Both level IS NOT expected to be skipped after relevel"

    # Second, compare against R input (taken from runit_relevel.R)
    # NOTE(review): dr is built but not used by the comparison below.
    dr = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")

    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {
        "Intercept": -7.63245,
        "DPROS.Both": 1.39185,
        "DPROS.Left": 0.73482,
        "DPROS.Right": 1.51437,
        "RACE.White": 0.65160,
        "DCAPS.Yes": 0.49233,
        "AGE": -0.01189,
        "PSA": 0.02990,
        "VOL": -0.01141,
        "GLEASON": 0.96466927
    }
    coeff_diff = {
        key: abs(expected - coefs2.get(key, 0))
        for key, expected in exp_coefs.items()
    }
    assert max(coeff_diff.values()) < 1e-4, \
        "coefficients differ from R results: {0}".format(coeff_diff)
def test_lambda_warning():
    """Check the warning raised when lambda_search and Lambda are both set.

    The same Gaussian GLM is trained twice on the Boston housing data; the
    second training runs under pyunit_utils.catch_warnings and must produce
    the "disabled when user specified any lambda value(s)" warning.
    """
    boston = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))
    response_idx = 13
    predictor_idxs = list(range(13))

    def fit_conflicting_glm():
        # lambda_search together with an explicit Lambda is the conflicting setup.
        glm = H2OGeneralizedLinearEstimator(family="Gaussian", lambda_search=True, Lambda=[0.01])
        glm.train(x=predictor_idxs, y=response_idx, training_frame=boston)
        return glm

    fit_conflicting_glm()

    with pyunit_utils.catch_warnings() as ws:
        fit_conflicting_glm()
        assert pyunit_utils.contains_warning(ws, 'disabled when user specified any lambda value(s)')
def link_functions_gamma():
    """Compare h2o and statsmodels gamma GLMs for the inverse and log links.

    For each link the ratio residual deviance / null deviance of the h2o model
    must not exceed the statsmodels ratio by 0.01 or more.
    """
    print("Read in prostate data.")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=data_path)
    h2o_data.head()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # old and new pandas alike.  The archive is closed once the CSV is read.
    with zipfile.ZipFile(data_path) as archive:
        sm_data = pd.read_csv(archive.open("prostate_complete.csv")).values
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    print("Create models with canonical link: INVERSE")
    h2o_model_in = H2OGeneralizedLinearEstimator(family="gamma", link="inverse", alpha=0.5, Lambda=0)
    h2o_model_in.train(x=myX, y=myY, training_frame=h2o_data)
    sm_model_in = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                         family=sm.families.Gamma(sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = old_div(h2o_model_in.residual_deviance(), h2o_model_in.null_deviance())
    sm_deviance_in = old_div(sm_model_in.deviance, sm_model_in.null_deviance)
    assert h2o_deviance_in - sm_deviance_in < 0.01, \
        "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = H2OGeneralizedLinearEstimator(family="gamma", link="log", alpha=0.5, Lambda=0)
    h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)
    sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                          family=sm.families.Gamma(sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(), h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance, sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, \
        "expected h2o to have an equivalent or better deviance measures"
def buildModelCheckStdCoeffs(training_fileName, family):
    """Check that coef() and coef_norm() of a GLM are consistent.

    Three models are trained on the given file: one with standardization, one
    without, and one without standardization on data whose numerical columns
    were standardized by hand.  Their coefficients are cross-checked with
    assert_coeffs_equal and compare_coeffs_2_model.

    :param training_fileName: path of the data set, resolved by pyunit_utils.locate.
    :param family: GLM family; the response becomes a factor for 'binomial'
        and 'multinomial'.
    """
    frame = h2o.import_file(pyunit_utils.locate(training_fileName))
    Y = frame.ncols - 1  # response is the last column
    x = list(range(0, Y))
    first_numeric = int(Y / 2)  # first half of the columns are enums

    if family in ('binomial', 'multinomial'):
        frame[Y] = frame[Y].asfactor()
    for col in range(first_numeric):
        frame[col] = frame[col].asfactor()

    def fit_glm(standardize):
        glm = H2OGeneralizedLinearEstimator(family=family, standardize=standardize)
        glm.train(training_frame=frame, x=x, y=Y)
        return glm

    std_coefs_reference = fit_glm(True).coef_norm()

    unstandardized_glm = fit_glm(False)
    norm_coefs = unstandardized_glm.coef_norm()
    raw_coefs = unstandardized_glm.coef()

    if family != 'multinomial':  # binomial and gaussian
        assert_coeffs_equal(norm_coefs, raw_coefs, frame)
    else:
        for class_idx in range(len(raw_coefs)):
            class_norm = norm_coefs["std_coefs_class_" + str(class_idx)]
            class_raw = raw_coefs["coefs_class_" + str(class_idx)]
            print("Comparing multinomial coefficients for class {0}".format(
                class_idx))
            assert_coeffs_equal(class_norm, class_raw, frame)

    # standardize the numerical columns by hand: mean 0 and std 1
    for col in range(first_numeric, Y):
        mean = frame[col].mean()
        inv_sigma = 1.0 / math.sqrt(frame[col].var())
        frame[col] = (frame[col] - mean) * inv_sigma

    hand_standardized_glm = fit_glm(False)
    hand_norm_coefs = hand_standardized_glm.coef_norm()
    # standardized coefficients from the first and the last model must agree
    compare_coeffs_2_model(family, std_coefs_reference, hand_norm_coefs)

    # coef() should equal coef_norm() here since the training data are
    # standardized already
    hand_raw_coefs = hand_standardized_glm.coef()
    compare_coeffs_2_model(family, hand_norm_coefs, hand_raw_coefs, sameModel=True)
def shuffling_large():
    """Check that GLM coefficients do not depend on row order.

    Two binomial models are trained on the original Arcene data and one on a
    shuffled copy.  The coefficient table of the first model is compared row
    by row with each of the other two.
    """
    print("Reading in Arcene training data for binomial modeling.")
    original = h2o.upload_file(path=pyunit_utils.locate(
        "smalldata/arcene/shuffle_test_version/arcene.csv"))
    shuffled = h2o.upload_file(path=pyunit_utils.locate(
        "smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    def fit(frame):
        glm = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
        glm.train(x=range(1000), y=1000, training_frame=frame)
        return glm

    def coefficient_rows(glm):
        return glm._model_json['output']['coefficients_table'].cell_values

    def assert_same_rows(left_rows, right_rows):
        for left, right in zip(left_rows, right_rows):
            same_types = type(left[1]) is type(right[1]) and type(left[2]) is type(right[2])
            assert same_types, "coefficients should be the same type"
            for idx in (1, 2):
                if isinstance(left[idx], float):
                    assert abs(left[idx] - right[idx]) < 5e-10, "coefficients should be equal"

    print("Create model on original Arcene dataset.")
    first = fit(original)

    print("Create second model on original Arcene dataset.")
    second = fit(original)

    print("Create model on shuffled Arcene dataset.")
    on_shuffled = fit(shuffled)

    print(
        "Assert that number of predictors remaining and their respective coefficients are equal."
    )
    assert_same_rows(coefficient_rows(first), coefficient_rows(second))
    assert_same_rows(coefficient_rows(first), coefficient_rows(on_shuffled))
def test_HGLM_R():
    """Check that an HGLM gives the same metrics with and without startval.

    Two gaussian HGLM models are trained on the semiconductor data, the second
    one with explicit start values.  Their scalar training metrics must agree
    within 1e-4 and their array metrics are compared through
    pyunit_utils.equal_two_arrays.
    """
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/semiconductor.csv"))
    response = "y"
    predictors = ["x1", "x3", "x5", "x6"]
    random_cols = [0]
    tolerance = 1e-4
    frame[0] = frame[0].asfactor()
    start_vals = [
        0.001929687, 0.002817188, -0.001707812, -0.003889062, 0.010685937,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0.1, 0.1
    ]

    default_glm = H2OGeneralizedLinearEstimator(HGLM=True, family="gaussian",
                                                rand_family=["gaussian"],
                                                random_columns=random_cols,
                                                calc_like=True)
    default_glm.train(x=predictors, y=response, training_frame=frame)
    default_metrics = default_glm.training_model_metrics()

    seeded_glm = H2OGeneralizedLinearEstimator(HGLM=True, family="gaussian",
                                               rand_family=["gaussian"],
                                               random_columns=random_cols,
                                               calc_like=True,
                                               startval=start_vals)
    seeded_glm.train(x=predictors, y=response, training_frame=frame)
    seeded_metrics = seeded_glm.training_model_metrics()

    # model metrics from both models should be the same
    scalar_metrics = ("hlik", "pvh", "dfrefe", "varfix", "pbvh", "convergence",
                      "caic", "sumetadiffsquare")
    array_metrics = ("summvc1", "sefe", "varranef", "ranef", "sere", "fixef")

    for metric in scalar_metrics:
        assert abs(default_metrics[metric] - seeded_metrics[metric]) < tolerance, \
            "expected {0}: {1}, actual {0}: {2}".format(
                metric, default_metrics[metric], seeded_metrics[metric])

    for metric in array_metrics:
        pyunit_utils.equal_two_arrays(default_metrics[metric],
                                      seeded_metrics[metric], 1e-10, tolerance)
def link_correct_default():
    """Check that leaving the link unspecified selects the canonical link.

    For the gaussian, binomial, poisson and gamma families a model without an
    explicit link and a model with the canonical link are trained on the
    prostate data; their coefficient tables must be identical.
    """
    print("Reading in original prostate data.")
    prostate = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv.zip"))

    print("Compare models with link unspecified and canonical link specified.")
    # (banner, family, canonical link, first predictor, end of predictors, response)
    cases = (
        ("GAUSSIAN: ", "gaussian", "identity", 1, 8, 8),
        ("BINOMIAL: ", "binomial", "logit", 2, 9, 1),
        ("POISSON: ", "poisson", "log", 2, 9, 1),
        ("GAMMA: ", "gamma", "inverse", 3, 9, 2),
    )
    for banner, family, link, x_start, x_stop, response in cases:
        print(banner)
        unspecified = H2OGeneralizedLinearEstimator(family=family)
        unspecified.train(x=list(range(x_start, x_stop)), y=response,
                          training_frame=prostate)
        specified = H2OGeneralizedLinearEstimator(family=family, link=link)
        specified.train(x=list(range(x_start, x_stop)), y=response,
                        training_frame=prostate)
        specified_rows = specified._model_json['output']['coefficients_table'].cell_values
        unspecified_rows = unspecified._model_json['output']['coefficients_table'].cell_values
        assert specified_rows == unspecified_rows, "coefficient should be equal"
def buildModelCheckStdCoeffs(training_fileName, family):
    """Check that standardized GLM coefficients are computed consistently.

    A model trained with ``standardize=True`` is compared with a model trained
    with ``standardize=False`` on data whose numerical columns were
    standardized by hand; both must report the same standardized
    coefficients.  The coef()/coef_norm() outputs of an unstandardized model
    are also cross-checked with assert_coeffs_equal.

    :param training_fileName: path of the data set, resolved by pyunit_utils.locate.
    :param family: GLM family; the response becomes a factor for 'binomial'
        and 'multinomial'.
    """
    training_data = h2o.import_file(pyunit_utils.locate(training_fileName))
    ncols = training_data.ncols
    Y = ncols - 1
    x = list(range(0, Y))
    enumCols = Y / 2
    if family == 'binomial' or family == 'multinomial':
        training_data[Y] = training_data[Y].asfactor()
    for ind in range(int(enumCols)):  # first half of the columns are enums
        training_data[ind] = training_data[ind].asfactor()

    model1 = H2OGeneralizedLinearEstimator(family=family, standardize=True)
    model1.train(training_frame=training_data, x=x, y=Y)
    stdCoeff1 = model1.coef_norm()

    modelNS = H2OGeneralizedLinearEstimator(family=family, standardize=False)
    modelNS.train(training_frame=training_data, x=x, y=Y)
    coeffNSStandardized = modelNS.coef_norm()
    coeffNS = modelNS.coef()

    if family == 'multinomial':
        nclass = len(coeffNS)
        for cind in range(nclass):
            coeff1PerClass = coeffNSStandardized["std_coefs_class_" + str(cind)]
            coeff2PerClass = coeffNS["coefs_class_" + str(cind)]
            print("Comparing multinomial coefficients for class {0}".format(cind))
            assert_coeffs_equal(coeff1PerClass, coeff2PerClass, training_data)
    else:  # for binomial and gaussian
        assert_coeffs_equal(coeffNSStandardized, coeffNS, training_data)

    # standardize numerical columns here
    for ind in range(int(enumCols), Y):  # change the numerical columns to have mean 0 and std 1
        aver = training_data[ind].mean()
        sigma = 1.0 / math.sqrt(training_data[ind].var())
        training_data[ind] = (training_data[ind] - aver) * sigma

    model2 = H2OGeneralizedLinearEstimator(family=family, standardize=False)
    model2.train(training_frame=training_data, x=x, y=Y)
    coeff2 = model2.coef_norm()

    if family == 'multinomial':  # special treatment, it contains a dict of dict
        assert len(stdCoeff1) == len(coeff2), \
            "Coefficient dictionary lengths are different. One has length {0} while" \
            " the other one has length {1}.".format(len(stdCoeff1), len(coeff2))
        for name in stdCoeff1.keys():
            pyunit_utils.equal_two_dicts(stdCoeff1[name], coeff2[name])
    else:
        # Compare model1 against model2; comparing stdCoeff1 with itself
        # could never fail.
        pyunit_utils.equal_two_dicts(stdCoeff1, coeff2)
def interactions():
    """Check that ``interactions`` and explicit ``interaction_pairs`` agree.

    A binomial GLM built with ``interactions`` over three airline columns must
    report the same coefficient names as one built with the three
    corresponding ``interaction_pairs``.
    """
    airlines = h2o.import_file(pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))
    # zero-based positions of the columns used by the models
    selected = [airlines.names[pos] for pos in (0, 1, 2, 3, 5, 7, 8, 12, 16, 17, 18, 30)]
    interaction_cols = [selected[pos] for pos in (4, 6, 8)]
    assert interaction_cols == ["CRSDepTime", "UniqueCarrier", "Origin"]
    response = selected[-1]

    implicit_glm = H2OGeneralizedLinearEstimator(lambda_search=True, family="binomial",
                                                 interactions=interaction_cols)
    implicit_glm.train(x=selected[:], y=response, training_frame=airlines)
    implicit_table = implicit_glm._model_json['output']['coefficients_table']

    pairs = [("CRSDepTime", "UniqueCarrier"),
             ("CRSDepTime", "Origin"),
             ("UniqueCarrier", "Origin")]
    explicit_glm = H2OGeneralizedLinearEstimator(lambda_search=True, family="binomial",
                                                 interaction_pairs=pairs)
    explicit_glm.train(x=selected[:], y=response, training_frame=airlines)
    explicit_table = explicit_glm._model_json['output']['coefficients_table']

    assert implicit_table["names"] == explicit_table["names"]
def link_functions_tweedie_vpow():
    """Compare tweedie GLM deviances with reference values from R.

    For each variance power a tweedie GLM is fitted with link power
    ``1 - variance power``.  Its residual/null deviance ratio and its null
    deviance are checked against the stored R results.
    """
    # Load example data from HDtweedie, y = aggregate claim loss
    claims = h2o.upload_file(pyunit_utils.locate("smalldata/glm_test/auto.csv"))
    response = "y"
    predictors = list(set(claims.names) - set(["y"]))

    print("Testing for family: TWEEDIE")
    print("Create models with canonical link: TWEEDIE")
    # (variance power, R residual/null deviance ratio, R null deviance)
    reference = (
        (0, 0.7516627, 221051.88369951),
        (1, 0.6708826, 32296.29783702),
        (1.5, 0.7733762, 20229.47425307),
    )
    for vpow, r_ratio, r_null_dev in reference:
        print("Fit h2o.glm:")
        fit = H2OGeneralizedLinearEstimator(family="tweedie", link="tweedie",
                                            tweedie_variance_power=vpow,
                                            tweedie_link_power=1-vpow,
                                            alpha=0.5, Lambda=0)
        fit.train(x=predictors, y=response, training_frame=claims)

        print("Testing Tweedie variance power: {0}".format(vpow))
        print("Compare model deviances for link function tweedie")
        h2o_ratio = old_div(fit.residual_deviance(), fit.null_deviance())
        assert r_ratio - h2o_ratio <= 0.01, \
            "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: {0}, r: {1}".format(
                h2o_ratio, r_ratio)

        print("compare null and residual deviance between R glm and h2o.glm for tweedie")
        assert abs(r_null_dev - fit.null_deviance()) < 1e-6, \
            "h2o's null deviance is not equal to R's. h2o: {0}, r: {1}".format(
                fit.null_deviance(), r_null_dev)
def test_glm_backward_compare():
    """Check backward model selection against a plain GLM on the same predictors.

    With ``min_predictor_number`` set to 200, the first coefficient set
    returned by the backward-mode model selection is compared with the
    coefficients of an unregularised binomial GLM (``lambda_=0``) trained
    with the same weights; they must agree to within 1e-6.
    """
    response_col = 'response'
    weight = 'wt'
    min_predictor_num = 200

    frame = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    # Take the predictor list before the weight column is appended, so the
    # slice drops only the frame's original last column.
    predictors = frame.columns[0:-1]

    # Weight rows with response == 1 a hundred times heavier than the rest.
    frame[weight] = 1
    frame[frame[response_col] == 1, weight] = 100
    frame[response_col] = frame[response_col].asfactor()

    selector = H2OModelSelectionEstimator(family='binomial',
                                          weights_column=weight,
                                          mode='backward',
                                          min_predictor_number=min_predictor_num)
    selector.train(predictors, response_col, training_frame=frame)
    selector_coeff = selector.coef()[0]

    reference_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                                  lambda_=0,
                                                  compute_p_values=True,
                                                  weights_column=weight)
    reference_glm.train(predictors, response_col, training_frame=frame)
    reference_coeff = reference_glm.coef()

    pyunit_utils.assertEqualCoeffDicts(reference_coeff, selector_coeff, tol=1e-6)
def toy_classifications():
    """Fit a binomial GLM and a random forest on a train/validation split of ``fr``.

    Relies on names defined outside this function: ``fr``, ``share_cols``,
    ``divide_train_test``, ``plot_betas`` and ``plt``.
    """
    target = 'y'
    features = ['VatRatio', 'LocalVatRatio', 'TurnoverGross',
                'TotalReturnCount', 'RefundClaimedBoolean'] + share_cols

    # Alternative split: fr.split_frame([0.6, 0.2], seed=1234)
    # (simply subsets of fr)
    train, valid, _holdout = divide_train_test(fr)

    # GLM: first on the training frame alone ...
    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=features, y=target, training_frame=train)
    glm.confusion_matrix()  # or glm.model_performance() or simply glm

    # ... then again with a validation frame so validation metrics exist.
    # (A deep learning estimator could be swapped in here instead.)
    glm.train(x=features, y=target, training_frame=train, validation_frame=valid)
    glm.confusion_matrix(valid=True)
    plt.plot(*glm.roc(valid=1))
    # glm.model_performance(test_data=_holdout) would score the held-out part.

    # Random Forest
    forest = H2ORandomForestEstimator(model_id="rf_v1",
                                      ntrees=200,
                                      stopping_rounds=2,
                                      score_each_iteration=True,
                                      seed=1000000)
    forest.train(features, target, training_frame=train, validation_frame=valid)
    forest.confusion_matrix(valid=1)
    # plt.plot(*forest.roc(valid=1)) is the direct-plot alternative.
    plot_betas(forest.roc(valid=1))
def glm_solvers():
    """Train a GLM for every solver/family combination on the cars data set.

    The response column depends on the family: ``economy_20mpg`` (converted
    to a factor) for binomial, ``economy`` for gaussian, and ``cylinders``
    for the remaining families (both converted to numeric). The test passes
    when every combination trains without raising.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    solvers = ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
               "COORDINATE_DESCENT"]
    families = ["binomial", "gaussian", "poisson", "tweedie", "gamma"]

    for solver in solvers:
        # print() call form: valid on Python 3 and prints the same text on Python 2.
        print("Solver = {0}".format(solver))
        for family in families:
            if family == 'binomial':
                response_col = "economy_20mpg"
            elif family == 'gaussian':
                response_col = "economy"
            else:
                response_col = "cylinders"
            print("Family = {0}".format(family))

            # Binomial needs a categorical response; every other family a numeric one.
            if family == 'binomial':
                training_data[response_col] = training_data[response_col].asfactor()
            else:
                training_data[response_col] = training_data[response_col].asnumeric()

            model = H2OGeneralizedLinearEstimator(family=family,
                                                  alpha=0,
                                                  Lambda=1e-5,
                                                  solver=solver)
            model.train(x=predictors, y=response_col, training_frame=training_data)
def test_HGLM_R():
    """Check H2O HGLM training metrics against reference values from R.

    A gaussian HGLM with one gaussian random column (column index 0) is
    trained, and each listed training metric must match the stored R value
    to within an absolute tolerance of 1e-6.
    """
    tot = 1e-6  # absolute tolerance for every metric comparison
    column_types = ["enum", "enum", "enum", "enum",
                    "numeric", "numeric", "numeric", "numeric"]
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/HGLM_5KRows_100Z.csv"),
        col_types=column_types)

    y = "response"
    x = ["enum1", "enum2", "enum3", "num1", "num2", "num3"]
    z = 0  # index of the random-effect column

    h2o_glm = H2OGeneralizedLinearEstimator(HGLM=True,
                                            family="gaussian",
                                            rand_family=["gaussian"],
                                            random_columns=[z],
                                            calc_like=True)
    h2o_glm.train(x=x, y=y, training_frame=h2o_data)
    modelMetrics = h2o_glm.training_model_metrics()

    # Reference values produced by R for the same model.
    rmodelMetrics = {
        "hlik": -23643.3076231,
        "caic": 47019.7968491,
        "pvh": -23491.5738429,
        "pbvh": -23490.2982034,
        "dfrefe": 4953.0,
        "varfix": 703.86912057,
    }

    for metric in ["hlik", "caic", "pvh", "pbvh", "dfrefe", "varfix"]:
        expected = rmodelMetrics[metric]
        actual = modelMetrics[metric]
        assert abs(expected - actual) < tot, (
            "for {2}, Expected from R: {0}, actual from H2O-3: "
            "{1}"
        ).format(expected, actual, metric)
def predict_from_standalone_lr(self, train, test, valid, x, y, prediction_field_name):
    """Train a binomial logistic regression and return its predictions for `test`.

    The model is a GLM (binomial family, logit link, L_BFGS solver) trained
    on `train` and validated on `valid`. The response levels, level count
    and training/validation AUC are printed along the way.

    :param train: the training H2O dataframe
    :param test: the testing H2O dataframe, which is scored by the model
    :param valid: the validation H2O dataframe
    :param x: the feature variables
    :param y: the target variable
    :param prediction_field_name: the name to use for the field holding predictions
    :returns: the result of ``self.set_prediction_field_name`` applied to the
        prediction frame (documented by the original author as the H2O
        dataframe with the prediction field and all supplied fields)
    """
    print("Logistic Regression")
    model = H2OGeneralizedLinearEstimator(model_id='glm_v1',
                                          family='binomial',
                                          link='logit',
                                          solver='L_BFGS')
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)

    # Diagnostics on the response column and the fitted model.
    print("train[y].levels():", train[y].levels()[0])
    y_level_count = train[y].nlevels()[0]
    print("y_level_count:", y_level_count)
    print("AUC (training):", model.auc(train=True))
    print("AUC (validation):", model.auc(valid=True))

    predictions = model.predict(test)
    print("Logistic Regression Predictions:")
    print(predictions.head(rows=5))

    renamed = self.set_prediction_field_name(predictions, prediction_field_name)
    return renamed
def test_prostate():
    """Compare the null deviance of an H2O binomial GLM with statsmodels on prostate.csv.

    The same CSV is loaded into H2O and, through pandas, into a NumPy array
    for statsmodels. Column 1 is the response and columns 2 onwards are the
    features in both fits. The two null deviances must agree to within 1e-5.
    """
    csv_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")

    h2o_data = h2o.upload_file(path=csv_path)
    h2o_data.summary()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in both old and new pandas releases.
    sm_data = pd.read_csv(csv_path).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                            nfolds=10,
                                            alpha=0.5)
    h2o_glm.train(x=list(range(2, h2o_data.ncol)),
                  y=1,
                  training_frame=h2o_data)

    sm_glm = sm.GLM(endog=sm_data_response,
                    exog=sm_data_features,
                    family=sm.families.Binomial()).fit()

    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, \
        "Expected null deviances to be the same"
def pyunit_make_glm_model():
    """Post a MakeGLMModel request that overrides the betas of a trained GLM.

    A gaussian GLM is trained on the prostate data, then a request is sent
    with the coefficient names and replacement values (0.5 each) encoded as
    bracketed, comma-separated list strings.
    """
    # TODO: PUBDEV-1717
    pros = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = H2OGeneralizedLinearEstimator(family="gaussian", alpha=[0])
    model.train(x=["AGE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                y="CAPSULE",
                training_frame=pros)

    new_betas = {
        "AGE": 0.5,
        "DPROS": 0.5,
        "DCAPS": 0.5,
        "PSA": 0.5,
        "VOL": 0.5,
        "GLEASON": 0.5,
    }

    # str.join builds the same '["a","b",...]' / '[0.5,0.5,...]' strings as
    # appending with a trailing comma and slicing it off, without the
    # repeated concatenation. Keys and values iterate in the same order.
    names = "[" + ",".join('"' + name + '"' for name in new_betas) + "]"
    betas = "[" + ",".join(str(value) for value in new_betas.values()) + "]"

    h2o.H2OConnection.post_json("MakeGLMModel",
                                model=model._id,
                                names=names,
                                beta=betas)
def save_load_model():
    """Save a trained binomial GLM to the results directory and load it back.

    Asserts that the results directory exists, that ``h2o.save_model``
    produced a file, and that ``h2o.load_model`` returns an ``H2OEstimator``.
    """
    prostate = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                 alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "DCAPS"],
                       y="CAPSULE",
                       training_frame=prostate)

    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), \
        "Expected save directory {0} to exist, but it does not.".format(path)

    model_path = h2o.save_model(prostate_glm, path=path, force=True)
    assert os.path.isfile(model_path), \
        "Expected load file {0} to exist, but it does not.".format(model_path)

    the_model = h2o.load_model(model_path)
    # The message now names the type that is actually checked (H2OEstimator);
    # it previously said "Expected and H2OBinomialModel".
    assert isinstance(the_model, H2OEstimator), \
        "Expected an H2OEstimator, but got {0}".format(the_model)
def std_coef_plot_test():
    """Exercise ``std_coef_plot`` on a GLM, with and without ``num_of_features``.

    A default GLM is trained on a random 80/20 train/validation split of the
    cars data, then the standardized coefficient plot is drawn twice with
    ``server=True``.
    """
    plot_options = {'server': True}

    # import data set
    cars = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Constructing validation and train sets by sampling (20/80)
    s = cars[0].runif()
    cars_train = cars[s <= 0.8]
    cars_valid = cars[s > 0.8]

    # set list of features, target, and convert target to factor
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"
    # NOTE(review): the conversion is applied to `cars` after the two subsets
    # were taken; confirm whether the subsets are meant to see it.
    cars[response_col] = cars[response_col].asfactor()

    # Build and train a GLM model
    cars_glm = H2OGeneralizedLinearEstimator()
    cars_glm.train(x=predictors,
                   y=response_col,
                   training_frame=cars_train,
                   validation_frame=cars_valid)

    # Plot GLM standardized coefficient magnitudes and check that
    # num_of_features accepts input
    cars_glm.std_coef_plot(**plot_options)
    cars_glm.std_coef_plot(num_of_features=2, **plot_options)
def h2oapi():
    """
    Python API test: h2o.api(endpoint, data=None, json=None, filename=None, save_to=None)

    Trains a binomial GLM on the benign data set, then uses h2o.api() to
    fetch the frame summary and the GLM regularization path, checking the
    response types and a few expected counts.

    Raises AssertionError when a check fails or when any other exception
    occurs; in the latter case the original exception is included in the
    message.
    """
    try:
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
        Y = 3
        X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

        model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
        model.train(x=X, y=Y, training_frame=training_data)

        frame_api = h2o.api("GET /3/Frames/%s/summary" % training_data.frame_id)
        assert_is_type(frame_api, H2OResponse)
        hf_col_summary = h2o.api("GET /3/Frames/%s/summary" % training_data.frame_id)["frames"][0]

        # test h2o.api() getting frame information
        assert hf_col_summary["row_count"] == 100, "row count is incorrect. Fix h2o.api()."
        assert hf_col_summary["column_count"] == 14, "column count is incorrect. Fix h2o.api()."

        # test h2o.api() getting model information
        model_api = h2o.api("GET /3/GetGLMRegPath",
                            data={"model": model._model_json["model_id"]["name"]})
        assert_is_type(model_api, H2OResponse)
        model_coefficients = model_api["coefficients"][0]
        assert len(model_coefficients) == 11, \
            "Number of coefficients is wrong. h2o.api() command is not working."
    except AssertionError:
        # Let the specific check messages above reach the test report instead
        # of being replaced by the generic one below.
        raise
    except Exception as e:
        # Any other failure still fails the test, now with the underlying cause.
        assert False, "h2o.api() command is not working. Cause: {0!r}".format(e)
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs

    The name node is read from the NAME_NODE environment variable and the
    model save location from MODEL_PATH. The test imports a data set,
    exports two columns and reads them back, then trains a GLM, saves it to
    hdfs, loads it again and predicts with the loaded model.
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    hdfs_root = "hdfs://" + hdfs_name_node
    exported_uri = hdfs_root + "/datasets/exported.csv"
    h2o_data = h2o.import_file(hdfs_root + "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for column in ["Month", "DayofMonth", "IsArrDelayed"]:
        h2o_data[column] = h2o_data[column].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, ["Year", "DayOfWeek"]], exported_uri)

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file(exported_uri)
    day_difference = test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]
    assert(day_difference.sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    # dont need to train on all features
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, hdfs_root + "/" + hdfs_model_path)

    print("Loading back model")
    new_model = h2o.load_model(new_model_path)

    print("Running predictions")
    predictions = new_model.predict(test)
def _init_model(args):
    """Build an untrained binomial GLM estimator configured from `args`.

    Reads ``args.n_folds`` for the cross-validation fold count and
    ``args.random_seed`` for the seed; lambda search is disabled.
    """
    # Imported here rather than at module level, as in the original.
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator

    estimator = H2OGeneralizedLinearEstimator(
        nfolds=args.n_folds,
        family="binomial",
        lambda_search=False,
        seed=args.random_seed,
    )
    return estimator
def h2odownload_pojo():
    """
    Python API test: h2o.download_pojo(model, path=u'', get_jar=True)

    Copied from glm_download_pojo.py

    Trains a binomial GLM on the prostate data and downloads its POJO into
    the results directory, checking that h2o-genmodel.jar is present there
    afterwards. If that attempt fails for any reason other than the check
    itself, the POJO is printed to the screen instead.

    Raises AssertionError when the jar check fails or when any other
    exception escapes; in the latter case the original exception is included
    in the message.
    """
    try:
        h2o_df = h2o.import_file(
            pyunit_utils.locate("smalldata/prostate/prostate.csv"))
        h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
        binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
        binomial_fit.train(y="CAPSULE",
                           x=["AGE", "RACE", "PSA", "GLEASON"],
                           training_frame=h2o_df)
        try:
            # find directory path to results folder
            results_dir = pyunit_utils.locate("results")
            h2o.download_pojo(binomial_fit, path=results_dir)
            assert os.path.isfile(os.path.join(results_dir, "h2o-genmodel.jar")), \
                "h2o.download_pojo() command is not working."
        except AssertionError:
            # A bare `except:` used to swallow this check, so a missing jar
            # could never fail the test.
            raise
        except Exception:
            # just print pojo to screen if directory does not exists
            h2o.download_pojo(binomial_fit)
    except AssertionError:
        raise
    except Exception as e:
        assert False, "h2o.download_pojo() command is not working. Cause: {0!r}".format(e)