def test_infogram_iris_x_attributes():
    """
    Check that a trained infogram model can be passed to GLM as its predictor specification.

    Two multinomial GLMs are trained on the iris data: one with the predictor list
    extracted from the infogram model, one with the infogram model itself given as
    ``x``.  Their coefficient dictionaries must agree entry by entry.
    """
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/admissibleml_test/irisROriginal.csv"))
    target = "Species"
    iris[target] = iris[target].asfactor()
    predictors = iris.names
    predictors.remove(target)

    # build infogram model with default settings
    infogram_model = H2OInfogram(seed=12345, distribution='multinomial')
    infogram_model.train(x=predictors, y=target, training_frame=iris)

    # reference GLM: predictors given as an explicit list taken from the infogram model
    glm_from_list = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_list.train(x=infogram_model._extract_x_from_model(), y=target, training_frame=iris)
    coefs_from_list = glm_from_list.coef()

    # GLM under test: the infogram model itself is passed as x
    glm_from_model = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_model.train(x=infogram_model, y=target, training_frame=iris)
    coefs_from_model = glm_from_model.coef()

    # the multinomial coefficient dict is nested; compare each inner dict separately
    for class_key in coefs_from_list:
        pyunit_utils.assertCoefDictEqual(coefs_from_list[class_key], coefs_from_model[class_key], tol=1e-6)
def test_infogram_personal_loan():
    """
    Check that GLM accepts an infogram model (built with protected columns) as ``x``.

    A GLM trained with the predictor list extracted from the infogram model must yield
    the same coefficients as a GLM trained with the infogram model passed directly.
    """
    loans = h2o.import_file(
        path=pyunit_utils.locate("smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    target = "Personal Loan"
    loans[target] = loans[target].asfactor()
    predictors = [
        "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
        "Securities Account", "CD Account", "Online", "CreditCard",
    ]

    infogram_model = H2OInfogram(seed=12345, protected_columns=["Age", "ZIP Code"])
    infogram_model.train(x=predictors, y=target, training_frame=loans)

    # reference GLM: predictors given as an explicit list taken from the infogram model
    glm_from_list = H2OGeneralizedLinearEstimator()
    glm_from_list.train(x=infogram_model._extract_x_from_model(), y=target, training_frame=loans)
    coefs_from_list = glm_from_list.coef()

    # GLM under test: the infogram model itself is passed as x
    glm_from_model = H2OGeneralizedLinearEstimator()
    glm_from_model.train(x=infogram_model, y=target, training_frame=loans)
    coefs_from_model = glm_from_model.coef()

    pyunit_utils.assertCoefDictEqual(coefs_from_list, coefs_from_model, tol=1e-6)
def test_maxrglm_gaussian_coefs():
    """
    Check that the three ways of reading maxrglm coefficients agree.

    For every predictor subset size the coefficients (raw and normalized) are read
    (a) by position from the full ``coef()`` / ``coef_norm()`` results,
    (b) from the model fetched through its id in ``best_model_ids``, and
    (c) through ``coef(subset_size)`` / ``coef_norm(subset_size)``.
    (a) and (c) must both match (b).
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=prostate, x=predictors, y=response)
    all_coefs = maxrglm_model.coef()
    all_coefs_norm = maxrglm_model.coef_norm()

    for subset_size in range(1, len(all_coefs) + 1):
        position = subset_size - 1  # results are stored 0-based, subset sizes are 1-based
        coef_by_position = all_coefs[position]
        coef_norm_by_position = all_coefs_norm[position]

        # coefficients obtained by fetching the individual model through its model id
        model_name = maxrglm_model._model_json["output"]["best_model_ids"][position]['name']
        fetched_model = h2o.get_model(model_name)
        expected_coef = fetched_model.coef()
        expected_coef_norm = fetched_model.coef_norm()

        # coefficients requested for one specific predictor subset size
        coef_by_size = maxrglm_model.coef(subset_size)
        coef_norm_by_size = maxrglm_model.coef_norm(subset_size)

        # check coefficient dicts are equal
        pyunit_utils.assertCoefDictEqual(coef_by_position, expected_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_norm_by_position, expected_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_by_size, expected_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_norm_by_size, expected_coef_norm, 1e-6)
def test_GLM_RCC_warning():
    """
    Check that GLM warns when ``remove_collinear_columns`` is set with the coordinate_descent solver.

    A tweedie GLM is trained with ``remove_collinear_columns=True`` while ``sys.stderr``
    is captured; the captured text must contain the expected warning phrase.  A second
    model without that flag is trained with stderr restored, and both models must have
    equal coefficients because the flag is expected to have no effect here.
    """
    warnNumber = 1
    hdf = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with canonical link: TWEEDIE")
    buffer = StringIO()
    # Remember the stream that is active right now so it can be put back even if the
    # build or training below raises; otherwise later output would vanish into the buffer.
    previous_stderr = sys.stderr
    sys.stderr = buffer  # redirect output
    try:
        model_h2o_tweedie = H2OGeneralizedLinearEstimator(
            family="tweedie", link="tweedie", alpha=0.5, Lambda=0.1,
            remove_collinear_columns=True, solver="coordinate_descent")
        model_h2o_tweedie.train(x=x, y=y, training_frame=hdf)  # this should generate a warning message
        model_h2o_tweedie_wo_rcc = H2OGeneralizedLinearEstimator(
            family="tweedie", link="tweedie", alpha=0.5, Lambda=0.1,
            solver="coordinate_descent")
    finally:
        sys.stderr = previous_stderr  # redirect printout back to normal path
    model_h2o_tweedie_wo_rcc.train(x=x, y=y, training_frame=hdf)  # no warning message here.

    # since remove_collinear_columns have no effect, this two models should be the same
    pyunit_utils.assertCoefDictEqual(model_h2o_tweedie.coef(), model_h2o_tweedie_wo_rcc.coef())

    # check and make sure we get the correct warning message
    warn_phrase = "remove_collinear_columns only works when IRLSM"
    try:  # for python 2.7
        assert len(buffer.buflist) == warnNumber
        print(buffer.buflist[0])
        assert warn_phrase in buffer.buflist[0]
    except (AttributeError, AssertionError, IndexError):  # for python 3.
        # Reached when the buffer has no ``buflist`` attribute or when the buflist-based
        # check fails; in both cases fall back to checking the whole captured text.
        # Only these exception types are caught so interrupts are no longer swallowed.
        warns = buffer.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert warn_phrase in warns
def match_models(self):
    """
    Pair every manually built GAM model with the grid-search model that has the same hyper-parameters.

    Two models are treated as the same configuration when their ``actual_params``
    agree on ``alpha``, ``lambda``, ``scale`` and ``num_knots``.  For each manual model
    the first matching grid model is counted in ``self.num_grid_models`` and its
    coefficients are compared with the manual model's.  Afterwards the count must equal
    ``self.num_expected_models``.
    """
    compared_params = ('alpha', 'lambda', 'scale', 'num_knots')
    for manual_model in self.manual_gam_models:
        wanted = [(name, manual_model.actual_params[name]) for name in compared_params]
        for candidate in self.h2o_model.models:
            same_config = all(candidate.actual_params[name] == value for name, value in wanted)
            if not same_config:
                continue
            self.num_grid_models += 1
            pyunit_utils.assertCoefDictEqual(candidate.coef(), manual_model.coef())
            break  # only the first matching grid model is paired with this manual model

    assert self.num_grid_models == self.num_expected_models, "Grid search model parameters incorrect or " \
                                                             "incorrect number of models generated"
def test_glm_beta_constraints_dict_megan():
    """
    Check that beta constraints given as a dict behave like the same constraints given as an H2OFrame.

    A GLM is trained with frame-based constraints on ``LIMIT_BAL`` and ``AGE`` and the
    ``LIMIT_BAL`` coefficient is checked against its lower bound (or zero).  A second
    GLM with the equivalent dict-based constraints must produce the same coefficients.
    """
    response = "DEFAULT_PAYMENT_NEXT_MONTH"
    df = h2o.import_file(pyunit_utils.locate("smalldata/kaggle/CreditCard/creditcard_train_cat.csv"),
                         col_types={"DEFAULT_PAYMENT_NEXT_MONTH": "enum"})
    lb_limit_bal = 0.0001

    # constraints expressed as an H2OFrame
    frame_constraints = h2o.H2OFrame({'names': ["LIMIT_BAL", "AGE"],
                                      'lower_bounds': [lb_limit_bal, lb_limit_bal],
                                      'upper_bounds': [1e6, 1e6]})
    # make sure we have the column names in expected order, the backend does weird things when the order is different
    frame_constraints = frame_constraints[["names", "lower_bounds", "upper_bounds"]]

    glm_beta = H2OGeneralizedLinearEstimator(model_id="beta_glm", beta_constraints=frame_constraints, seed=42)
    glm_beta.train(y=response, training_frame=df)
    glm_coeff = glm_beta.coef()
    limit_bal_coef = glm_coeff["LIMIT_BAL"]
    assert limit_bal_coef == 0 or limit_bal_coef >= lb_limit_bal

    # using dict for beta_constraints
    dict_constraints = {
        "LIMIT_BAL": {"lower_bound": lb_limit_bal, "upper_bound": 1e6},
        "AGE": {"lower_bound": lb_limit_bal, "upper_bound": 1e6},
    }
    glm_beta_dict = H2OGeneralizedLinearEstimator(model_id="beta_glm", beta_constraints=dict_constraints, seed=42)
    glm_beta_dict.train(y=response, training_frame=df)
    glm_coeff_dict = glm_beta_dict.coef()

    # coefficients should be the same from both runs
    pyunit_utils.assertCoefDictEqual(glm_coeff, glm_coeff_dict, tol=1e-6)
    print("test complete!")
def test_gam_dual_mode_multinomial():
    """
    Check that GAM treats flat and nested ``gam_columns`` specifications identically.

    ``gam_columns`` is given once with single columns as plain strings and once with
    every entry wrapped in a list.  Both multinomial models must end up with the same
    coefficients and the same validation logloss.
    """
    data_path = "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    factor_columns = ("C11", "C1", "C2")

    # the training and validation frames are both imported from the same file
    train = h2o.import_file(pyunit_utils.locate(data_path))
    for column in factor_columns:
        train[column] = train[column].asfactor()
    test = h2o.import_file(pyunit_utils.locate(data_path))
    for column in factor_columns:
        test[column] = test[column].asfactor()

    x = ["C1", "C2"]
    y = "C11"
    gam_cols1 = ["C6", ["C7", "C8"], "C9", "C10"]  # single gam columns as plain strings
    gam_cols2 = [["C6"], ["C7", "C8"], ["C9"], ["C10"]]  # every entry wrapped in a list

    h2o_model1 = H2OGeneralizedAdditiveEstimator(family='multinomial', gam_columns=gam_cols1,
                                                 bs=[1, 1, 0, 0], max_iterations=2)
    h2o_model1.train(x=x, y=y, training_frame=train, validation_frame=test)
    h2o_model2 = H2OGeneralizedAdditiveEstimator(family='multinomial', gam_columns=gam_cols2,
                                                 bs=[1, 1, 0, 0], max_iterations=2)
    h2o_model2.train(x=x, y=y, training_frame=train, validation_frame=test)

    # check that both models produce the same coefficients
    coefs1 = h2o_model1.coef()
    coefs2 = h2o_model2.coef()
    print(coefs1)
    print(coefs2)
    pyunit_utils.assertCoefDictEqual(coefs1['coefficients'], coefs2['coefficients'], tol=1e-6)

    # check both models produce the same validation metrics
    logloss1 = h2o_model1.logloss(valid=True)
    logloss2 = h2o_model2.logloss(valid=True)
    assert abs(logloss1 - logloss2) < 1e-6, \
        "Expected validation logloss: {0}, Actual validation logloss: {1}".format(logloss1, logloss2)
def test_gaussian_alpha():
    """
    Check that gaussian GLM coefficients do not depend on ``generate_scoring_history``.

    Four scenarios with an array of alpha values are covered: lambda search on and
    off, each with and without cross-validation.  In every scenario one model is
    built with scoring history enabled and one with it disabled, on the same
    train/validation split, and their coefficients must be equal.
    """
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # NOTE(review): each column is assigned back to itself without any conversion.
        # The list name suggests .asfactor() may have been intended -- confirm before
        # changing, because that would alter the data the models are trained on.
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    # list.remove() returns None, so its result cannot be used as the predictor list;
    # build the list of all columns except the response explicitly instead.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # test with lambda search on, generate_scoring_history on and off
    model1 = glm(family="gaussian", lambda_search=True, alpha=[0, 0.2, 1],
                 generate_scoring_history=True, nlambdas=5)
    model1.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model2 = glm(family="gaussian", lambda_search=True, alpha=[0, 0.2, 1],
                 generate_scoring_history=False, nlambdas=5)
    model2.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertCoefDictEqual(model1.coef(), model2.coef())

    # test with lambda search off, generate_scoring_history on and off
    model1 = glm(family="gaussian", lambda_search=False, alpha=[0, 0.8, 1],
                 generate_scoring_history=True, Lambda=[0, 0.004])
    model1.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model2 = glm(family="gaussian", lambda_search=False, alpha=[0, 0.8, 1],
                 generate_scoring_history=False, Lambda=[0, 0.004])
    model2.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertCoefDictEqual(model1.coef(), model2.coef())

    # test with lambda search on, generate_scoring_history on and off, cv on
    model1 = glm(family="gaussian", lambda_search=True, alpha=[0, 0.8, 1],
                 generate_scoring_history=True, nfolds=2, seed=12345, nlambdas=5)
    model1.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model2 = glm(family="gaussian", lambda_search=True, alpha=[0, 0.8, 1],
                 generate_scoring_history=False, nfolds=2, seed=12345, nlambdas=5)
    model2.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertCoefDictEqual(model1.coef(), model2.coef())

    # test with lambda search off, generate_scoring_history on and off, cv on
    # NOTE(review): the two models below use different alpha lists ([0, 0.2, 1] vs
    # [0, 0.2]); every other scenario uses identical lists -- confirm whether this
    # difference is intentional.
    model1 = glm(family="gaussian", lambda_search=False, alpha=[0, 0.2, 1],
                 generate_scoring_history=True, Lambda=[0, 0.1], nfolds=2, seed=12345)
    model1.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model2 = glm(family="gaussian", lambda_search=False, alpha=[0, 0.2],
                 generate_scoring_history=False, Lambda=[0, 0.1], nfolds=2, seed=12345)
    model2.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertCoefDictEqual(model1.coef(), model2.coef())
def buildModelCheckCoeff(train_data, y, gamX, family):
    """
    Build the same GAM twice on one train/validation split and check the coefficients agree.

    :param train_data: frame that is split 90/10 into training and validation parts
    :param y: response column
    :param gamX: value passed as ``gam_columns`` to both estimators
    :param family: GLM family; for ``'multinomial'`` the coefficient dict is nested and
        each inner dict is compared separately
    """
    gam_params = dict(scale=[0.001, 0.001, 0.001], bs=[0, 0, 0], num_knots=[3, 4, 5])
    predictors = ["C1", "C2"]
    train_part, test_part = train_data.split_frame(ratios=[0.9])[:2]

    # building multiple models with same training / test datasets to make sure it works
    first_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX, **gam_params)
    first_model.train(x=predictors, y=y, training_frame=train_part, validation_frame=test_part)
    second_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX, **gam_params)
    second_model.train(x=predictors, y=y, training_frame=train_part, validation_frame=test_part)

    first_coefs = first_model.coef()
    second_coefs = second_model.coef()
    if family != 'multinomial':
        pyunit_utils.assertCoefDictEqual(first_coefs, second_coefs)
        return
    for key in first_coefs:
        pyunit_utils.assertCoefDictEqual(first_coefs[key], second_coefs[key])
def test_binomial_alpha():
    """
    Check that binomial GLM coefficients do not depend on ``generate_scoring_history``.

    Four scenarios with an array of alpha values are covered: lambda search on and
    off, each with and without cross-validation.  In every scenario one model is
    trained with scoring history enabled and one with it disabled; their coefficients
    must be equal.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

    def train_glm(generate_scoring_history, **scenario_params):
        # Build and train one binomial GLM; a fresh alpha list is created for every
        # estimator so no list object is shared between models.
        model = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0, 0.2, 0.5, 0.8, 1],
                                              generate_scoring_history=generate_scoring_history,
                                              **scenario_params)
        model.train(x=X, y=Y, training_frame=training_data)
        return model

    scenarios = [
        # lambda search on
        dict(lambda_search=True),
        # lambda search off
        dict(lambda_search=False, Lambda=[0, 0.1, 0.01, 0.001]),
        # lambda search on, cv on
        dict(lambda_search=True, nfolds=2, seed=12345),
        # lambda search off, cv on
        dict(lambda_search=False, nfolds=2, seed=12345, Lambda=[0, 0.1, 0.01, 0.001]),
    ]
    for scenario in scenarios:
        # The second model previously also used generate_scoring_history=True, so the
        # comparison never exercised the "off" setting named in the scenario comments.
        model1 = train_glm(True, **scenario)
        model2 = train_glm(False, **scenario)
        pyunit_utils.assertCoefDictEqual(model1.coef(), model2.coef())
def test_modelselection_gaussian_coefs():
    """
    Check coefficient access for ModelSelection across the allsubsets, maxr and maxrsweep modes.

    The maxr and maxrsweep result frames (columns 2:4) must match.  For every predictor
    subset size, the allsubsets coefficients (raw and normalized) read by position, by
    fetching the individual model through its id, and through ``coef(subset_size)`` /
    ``coef_norm(subset_size)`` must agree, and must also match the maxr coefficients.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=prostate, x=predictors, y=response)
    coefs_allsubsets = allsubsets_model.coef()
    coefs_norm_allsubsets = allsubsets_model.coef_norm()

    maxrsweep_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxrsweep")
    maxrsweep_model.train(training_frame=prostate, x=predictors, y=response)
    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=prostate, x=predictors, y=response)

    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(maxr_model.result()[2:4], maxrsweep_model.result()[2:4],
                                      prob=1.0, tol=1e-6)

    coefs_maxr = maxr_model.coef()
    coefs_norm_maxr = maxr_model.coef_norm()
    for subset_size in range(1, len(coefs_allsubsets) + 1):
        position = subset_size - 1  # results are stored 0-based, subset sizes are 1-based
        allsubsets_coef = coefs_allsubsets[position]
        allsubsets_coef_norm = coefs_norm_allsubsets[position]
        maxr_coef = coefs_maxr[position]
        maxr_coef_norm = coefs_norm_maxr[position]

        # coefficients obtained by fetching the individual model through its model id
        model_name = allsubsets_model._model_json["output"]["best_model_ids"][position]['name']
        fetched_model = h2o.get_model(model_name)
        expected_coef = fetched_model.coef()
        expected_coef_norm = fetched_model.coef_norm()

        # coefficients requested for one specific predictor subset size
        coef_by_size = allsubsets_model.coef(subset_size)
        coef_norm_by_size = allsubsets_model.coef_norm(subset_size)

        # check coefficient dicts are equal
        pyunit_utils.assertCoefDictEqual(allsubsets_coef, expected_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(allsubsets_coef_norm, expected_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_by_size, expected_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_norm_by_size, expected_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_by_size, maxr_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(coef_norm_by_size, maxr_coef_norm, 1e-6)
def test_gam_cv_fold_columns():
    """
    Check that GAM cross-validation works with ``nfolds``/``fold_assignment`` and with a fold column.

    One multinomial GAM is cross-validated with ``nfolds=5`` and modulo fold
    assignment, another with an explicit fold column appended to the training frame.
    Both models must report the same coefficients.
    """
    # create frame knots
    knot_values = [
        [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290],
        [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589],
        [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676],
    ]
    knot_frames = [h2o.H2OFrame(python_obj=values) for values in knot_values]

    # import the dataset
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    # convert the C1, C2, and C11 columns to factors
    for column in ("C1", "C2", "C11"):
        h2o_data[column] = h2o_data[column].asfactor()

    # split into train and validation sets (only the training part is used below)
    train, _ = h2o_data.split_frame(ratios=[.8])

    # set the predictor and response columns
    y = "C11"
    x = ["C1", "C2"]
    # specify the knots array
    numKnots = [5, 5, 5]

    def shared_gam_params():
        # Settings common to both estimators; built on every call so each estimator
        # receives its own list objects (apart from numKnots, which is shared).
        return dict(family='multinomial',
                    gam_columns=["C6", "C7", "C8"],
                    scale=[0, 1, 2],
                    num_knots=numKnots,
                    knot_ids=[frame.key for frame in knot_frames])

    # Both of these gives an NPE, should be fixed now.
    # build the GAM model with cross-validation driven by nfolds / fold_assignment
    h2o_model = H2OGeneralizedAdditiveEstimator(nfolds=5, seed=1234, fold_assignment='modulo',
                                                **shared_gam_params())
    h2o_model.train(x=x, y=y, training_frame=train)

    # create a fold column for train and rename it to "fold_numbers"
    fold_numbers = train.kfold_column(n_folds=5, seed=1234)
    fold_numbers.set_names(["fold_numbers"])
    train = train.cbind(fold_numbers)

    # build the GAM model with cross-validation driven by the fold column
    h2o_model_fold_column = H2OGeneralizedAdditiveEstimator(**shared_gam_params())
    h2o_model_fold_column.train(x=x, y=y, training_frame=train, fold_column="fold_numbers")

    # both model should return the same coefficients since they use the same fold assignment
    coeff = h2o_model.coef()
    coeff_fold_column = h2o_model_fold_column.coef()
    pyunit_utils.assertCoefDictEqual(coeff['coefficients'], coeff_fold_column['coefficients'])