# Shared imports assumed for the tests below, following the usual h2o-3 pyunit layout
# (TemporaryDirectory may instead come from the test harness; it is taken from tempfile here).
import math
from tempfile import TemporaryDirectory

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
from h2o.estimators.model_selection import H2OModelSelectionEstimator as modelSelection
from tests import pyunit_utils


def glm_alpha_lambda_arrays_cv():
    print("Testing glm cross-validation with alpha array, lambda array for gaussian models.")
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    myX = h2o_data.names
    myX.remove(myY)
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # choices made in model_all and model_xval should be the same since both use xval metrics
    model_all = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9],
                    nfolds=3, cold_start=True)
    model_all.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model_all_rpath = glm.getGLMRegularizationPath(model_all)
    model_xval = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9],
                     nfolds=3, cold_start=True)
    model_xval.train(x=myX, y=myY, training_frame=training_data)
    model_xval_rpath = glm.getGLMRegularizationPath(model_xval)
    for l in range(0, len(model_all_rpath['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients'][l],
                                           model_xval_rpath['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients_std'][l],
                                           model_xval_rpath['coefficients_std'][l], tol=1e-6)

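# NOTE (structure inferred from its use throughout these tests): glm.getGLMRegularizationPath(model)
# returns a dict of parallel lists indexed by submodel:
#   'alphas', 'lambdas'                - regularization settings of each submodel
#   'coefficients', 'coefficients_std' - per-submodel {name: value} dicts of raw and
#                                        standardized coefficients
#   'explained_deviance_train', 'explained_deviance_valid'
#                                      - 1 - residual_deviance/null_deviance per submodel
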
def reg_path_glm():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', lambda_search=True, solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial', lambda_search=False, Lambda=r['lambdas'][l],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff, abs(cs[n] - m.coef()[n]))
            diff2 = max(diff2, abs(cs_norm[n] - m.coef_norm()[n]))
        print(diff)
        print(diff2)
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4

def testGLMGaussianScoringHistory():
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_rmse",
                        "validation_rmse", "training_mae", "validation_mae",
                        "training_deviance", "validation_deviance"]
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    myX = h2o_data.names
    myX.remove(myY)
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build gaussian model with score_each_iteration set to True
    model = glm(family="gaussian", score_each_iteration=True)
    model.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1
    model_score_each = glm(family="gaussian", score_iteration_interval=1)
    model_score_each.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each, col_list_compare)
    # build gaussian model with score_each_iteration set to True, with CV
    model_cv = glm(family="gaussian", score_each_iteration=True, nfolds=3,
                   fold_assignment='modulo', seed=1234)
    model_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1, with CV
    model_score_each_cv = glm(family="gaussian", score_iteration_interval=1, nfolds=3,
                              fold_assignment='modulo', seed=1234)
    model_score_each_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv, col_list_compare)
    # with score_iteration_interval=4, the history should match at every fourth iteration
    model_cv_4th = glm(family="gaussian", score_iteration_interval=4, nfolds=3,
                       fold_assignment='modulo', seed=1234)
    model_cv_4th.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertEqualScoringHistoryIteration(model_cv_4th, model_cv, col_list_compare)

def glm_alpha_array_with_lambda_search_cv():
    # read in the dataset and construct training set (and validation set)
    print("Testing glm cross-validation with alpha array, lambda_search for multinomial models.")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5"]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    myX = h2o_data.names
    myX.remove(myY)
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build model with CV but no validation dataset
    cv_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)
    for l in range(0, len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l],
                                           cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l],
                                           cv_r_valid['coefficients_std'][l], tol=1e-6)

def glm_alpha_lambda_arrays():
    # compare coefficients and deviance when only training dataset is available
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        train[ind] = train[ind].asfactor()
    train["C21"] = train["C21"].asfactor()
    frames = train.split_frame(ratios=[0.8], seed=12345)
    d = frames[0]
    d_test = frames[1]
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    # compare results when validation dataset is present
    mLVal = glm(family='binomial', alpha=[0.1, 0.5], lambda_search=True,
                solver='COORDINATE_DESCENT', nlambdas=3)  # train with validation set
    mLVal.train(training_frame=d, x=list(range(20)), y=20, validation_frame=d_test)
    rVal = glm.getGLMRegularizationPath(mLVal)
    best_submodel_indexVal = mLVal._model_json["output"]["best_submodel_index"]
    m2Val = glm.makeGLMModel(model=mLVal, coefs=rVal['coefficients'][best_submodel_indexVal])
    dev1Val = rVal['explained_deviance_valid'][best_submodel_indexVal]
    p2Val = m2Val.model_performance(d_test)
    dev2Val = 1 - p2Val.residual_deviance() / p2Val.null_deviance()
    assert abs(dev1Val - dev2Val) < 1e-6
    for l in range(0, len(rVal['lambdas'])):
        m = glm(family='binomial', alpha=[rVal['alphas'][l]], Lambda=rVal['lambdas'][l],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=list(range(20)), y=20, validation_frame=d_test)
        mr = glm.getGLMRegularizationPath(m)
        p = m.model_performance(d_test)
        cs = rVal['coefficients'][l]
        cs_norm = rVal['coefficients_std'][l]
        print("Comparing submodel index {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-1)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-1)
        pyunit_utils.assertEqualRegPaths(regKeys, rVal, l, mr, tol=1e-3)
        dVal = 1 - p.residual_deviance() / p.null_deviance()
        if l == best_submodel_indexVal:  # validation metrics should be equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["validation_metrics"],
                                                 mLVal._model_json["output"]["validation_metrics"],
                                                 tol=1e-2)
        else:  # any other submodel should not beat the best submodel's explained deviance
            assert dVal <= dev2Val, "Best submodel does not have the highest " \
                                    "explained_deviance_valid for submodel {0}!".format(l)

def glm_alpha_array_lambda_null():
    # first test: compare coefficients and deviance
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True,
             solver='COORDINATE_DESCENT', cold_start=True, nlambdas=5)
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    coefClassSet = ['coefs_class_0', 'coefs_class_1', 'coefs_class_2', 'coefs_class_3',
                    'coefs_class_4', 'coefs_class_5', 'coefs_class_6', 'coefs_class_7']
    coefClassSetNorm = ['std_coefs_class_0', 'std_coefs_class_1', 'std_coefs_class_2',
                        'std_coefs_class_3', 'std_coefs_class_4', 'std_coefs_class_5',
                        'std_coefs_class_6', 'std_coefs_class_7']
    for l in range(0, len(r['lambdas'])):
        print("compare models for index {0}, alpha {1}, lambda {2}".format(
            l, r['alphas'][l], r['lambdas'][l]))
        m = glm(family='multinomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=list(range(0, 54)), y=54)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertCoefEqual(cs, m.coef(), coefClassSet)
        pyunit_utils.assertCoefEqual(cs_norm, m.coef_norm(), coefClassSetNorm)
        devm = 1 - m.residual_deviance() / m.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-6
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr)
        if l == best_submodel_index:  # training metrics should be equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"],
                                                 keySet=["MSE", "null_deviance", "logloss", "RMSE", "r2"],
                                                 tol=5e-1)
        else:  # any other submodel should have lower explained deviance than the best submodel
            assert devm <= r['explained_deviance_train'][best_submodel_index], \
                "Best submodel does not have the best explained_deviance_train!"

def glm_alpha_lambda_arrays():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', Lambda=[0.9, 0.5, 0.1], alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff, abs(cs[n] - m.coef()[n]))
            diff2 = max(diff2, abs(cs_norm[n] - m.coef_norm()[n]))
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-5)
        if l == best_submodel_index:  # training metrics should be equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"],
                                                 tol=1e-5)
        else:  # any other submodel should have worse residual deviance than the best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), \
                "Best submodel does not have the lowest residual_deviance()!"

def glm_alpha_array_lambda_null():
    # first test: compare coefficients and deviance
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1, 0.5, 0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    coefClassSet = ['coefs_class_0', 'coefs_class_1', 'coefs_class_2', 'coefs_class_3',
                    'coefs_class_4', 'coefs_class_5', 'coefs_class_6', 'coefs_class_7']
    coefClassSetNorm = ['std_coefs_class_0', 'std_coefs_class_1', 'std_coefs_class_2',
                        'std_coefs_class_3', 'std_coefs_class_4', 'std_coefs_class_5',
                        'std_coefs_class_6', 'std_coefs_class_7']
    for l in range(0, len(r['lambdas'])):
        m = glm(family='multinomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]])
        m.train(training_frame=d, x=list(range(0, 54)), y=54)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertCoefEqual(cs, m.coef(), coefClassSet, tol=1e-5)
        pyunit_utils.assertCoefEqual(cs_norm, m.coef_norm(), coefClassSetNorm, tol=1e-5)
        devm = 1 - m.residual_deviance() / m.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr)
        if l == best_submodel_index:  # training metrics should be equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"],
                                                 tol=1e-2)
        else:  # any other submodel should have worse logloss than the best submodel
            assert m.logloss() >= mL.logloss(), "Best submodel does not have the lowest logloss()!"

def test_makeGLMModel():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)   # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
        assert False, "Should have thrown an exception because of the bad coefficient length!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user " in temp), \
            "Wrong exception was received."
        print("coefficient test passed!")

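# NOTE (behavior inferred from the makeGLMModel tests in this file): glm.makeGLMModel builds a
# scoring-only GLM from an existing model plus a {coefficient name: value} dict, so any submodel
# on the regularization path can be turned into a model that predicts exactly like the original
# at those coefficients. A dict whose length does not match the source model's coefficient count
# is rejected server-side with an IllegalArgumentException, as the try/except blocks verify.
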
def test_glm_multinomial_coeffs():
    trainF = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    y = "species"
    x = [0, 1, 2, 3]
    bin_LS = glm(family='multinomial', seed=12345)
    bin_LS.train(x=x, y=y, training_frame=trainF)
    print(bin_LS.summary())
    coefficient_table_original = bin_LS._model_json["output"]["coefficients_table"]
    coefficient_table = bin_LS._model_json["output"]["coefficients_table_multinomials_with_class_names"]
    coeffNamesOld = coefficient_table_original.col_header
    coeffNames = coefficient_table.col_header
    validCoefficientNames = [u"names", u"coefs_class_Iris-setosa", u"coefs_class_Iris-versicolor",
                             u"coefs_class_Iris-virginica", u"std_coefs_class_Iris-setosa",
                             u"std_coefs_class_Iris-versicolor", u"std_coefs_class_Iris-virginica"]
    oldCoefficientNames = [u"names", u"coefs_class_0", u"coefs_class_1", u"coefs_class_2",
                           u"std_coefs_class_0", u"std_coefs_class_1", u"std_coefs_class_2"]
    print(coefficient_table)
    print(coefficient_table_original)
    # compare coefficient names
    assert len(set(coeffNames).intersection(validCoefficientNames)) == len(coeffNames), \
        "Expected coefficient names: {0}. Actual coefficient names: {1}".format(
            validCoefficientNames, coeffNames)
    assert len(set(coeffNamesOld).intersection(oldCoefficientNames)) == len(coeffNamesOld), \
        "Expected original coefficient names: {0}. Actual original coefficient names: " \
        "{1}".format(oldCoefficientNames, coeffNamesOld)
    # compare table contents to make sure they contain the same values
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(coefficient_table_original, coefficient_table,
                                                  [u'coefs_class_0'], tolerance=1e-10)

def test_glm_multinomial_makeGLMModel():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    # check_nonzero_coefs is a helper (assumed defined elsewhere in the original test file)
    # that counts the nonzero coefficients in the coefficient dict
    rank = check_nonzero_coefs(r['coefficients'][0])
    assert rank == mL._model_json["output"]["rank"], \
        "expected rank: {0}, actual rank: {1}.".format(rank, mL._model_json["output"]["rank"])
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123  # add an extra coefficient to the model coefficients
    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 371 is different from coefficient provided by user" in temp), \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")

def testOrdinalLogit():
    Dtrain = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"))
    Dtrain["C21"] = Dtrain["C21"].asfactor()
    print("Fit model on dataset")
    model = glm(family="ordinal", alpha=[0.5], lambda_=[0.001], max_iterations=1000,
                beta_epsilon=1e-8, objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415
    try:
        glm.makeGLMModel(model=model, coefs=coefs)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user " in temp), \
            "Wrong exception was received."
        print("coefficient test passed!")

def test_makeGLMModel():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', Lambda=[0.001], alpha=[0.5], solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)   # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
        assert False, "Test failed: should have thrown an exception because of the bad coefficient length!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user " in temp), \
            "Wrong exception was received."
        print("makeGLMModel test passed!")

def test_glm_scoring_history_TomasF():
    df = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    df["CAPSULE"] = df["CAPSULE"].asfactor()
    glmModel = glm(generate_scoring_history=True)
    glmModel.train(y="CAPSULE", training_frame=df)
    glmModel.scoring_history()

def testGLMBinomialScoringHistory():
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_logloss",
                        "validation_logloss", "training_classification_error",
                        "validation_classification_error", "training_rmse", "validation_rmse",
                        "training_auc", "validation_auc", "training_pr_auc", "validation_pr_auc",
                        "training_lift", "validation_lift"]
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        h2o_data[ind] = h2o_data[ind].asfactor()
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    train = splits_frames[0]
    valid = splits_frames[1]
    Y = "C21"
    X = list(range(0, 20))
    print("Building model with score_iteration_interval=1. Should generate the same model as "
          "score_each_iteration turned on.")
    h2o_model = glm(family="binomial", score_iteration_interval=1)
    h2o_model.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    print("Building model with score_each_iteration turned on.")
    h2o_model_score_each = glm(family="binomial", score_each_iteration=True)
    h2o_model_score_each.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each and h2o_model should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each, h2o_model, col_list_compare)
    print("Building model with score_each_iteration turned on, with CV.")
    h2o_model_score_each_cv = glm(family="binomial", score_each_iteration=True, nfolds=3,
                                  fold_assignment='modulo', seed=1234)
    h2o_model_score_each_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    print("Building model with score_iteration_interval=1 and CV. Should generate the same model "
          "as score_each_iteration turned on, with CV.")
    h2o_model_cv = glm(family="binomial", score_iteration_interval=1, nfolds=3,
                       fold_assignment='modulo', seed=1234)
    h2o_model_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each_cv and h2o_model_cv should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each_cv, h2o_model_cv, col_list_compare)
    # with score_iteration_interval=4, the history should match at every fourth iteration
    h2o_model_cv_4th = glm(family="binomial", score_iteration_interval=4, nfolds=3,
                           fold_assignment='modulo', seed=1234)
    h2o_model_cv_4th.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(h2o_model_cv, h2o_model_cv_4th, col_list_compare)

def set_glm_startvals():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    mLcoeff = mL.coef()
    r = glm.getGLMRegularizationPath(mL)
    rcoeff = r["coefficients"][0]
    # the null-model intercept is the logit of the response mean; all-zero coefficients plus
    # this intercept reproduce GLM's default starting condition
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startval1 = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startval2 = [rcoeff["AGE"], rcoeff["RACE"], rcoeff["DPROS"], rcoeff["DCAPS"],
                 rcoeff["PSA"], rcoeff["VOL"], rcoeff["GLEASON"], rcoeff["Intercept"]]
    startvalBad = [0, 0]
    ml1 = glm(family="binomial", startval=startval1)  # same starting condition as GLM default
    ml1.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml1Coeff = ml1.coef()
    pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml1Coeff, tol=1e-6)  # coeffs should be the same
    ml2 = glm(family="binomial", startval=startval2)  # different starting condition from GLM default
    ml2.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml2Coeff = ml2.coef()
    try:
        pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml2Coeff, tol=1e-6)
        assert False, "Should have thrown an error as the coefficients are different!"
    except Exception as ex:
        print(ex)
    try:
        mlbad = glm(family="binomial", startval=startvalBad)
        mlbad.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        assert False, "Should have thrown an error with bad GLM initial values!"
    except Exception as ex:
        print(ex)
    print("Test completed! Success!")

def binomial_plot_test():
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    response = 3
    predictors = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]
    model = glm(family="binomial")
    model.train(x=predictors, y=response, training_frame=benign)
    # test saving; test_plot_result_saving is assumed to be a helper from the test harness
    # (pyunit_utils), and TemporaryDirectory is taken from tempfile (see imports above)
    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)
        test_plot_result_saving(
            model.plot(timestep="AUTO", metric="objective", server=True), path2,
            model.plot(timestep="AUTO", metric="objective", server=True, save_plot_path=path1), path1)
        test_plot_result_saving(
            model.permutation_importance_plot(benign), path2,
            model.permutation_importance_plot(benign, save_plot_path=path1), path1)

def test_modelselection_gaussian():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model_maxr = modelSelection(seed=12345, max_predictor_number=3, mode="maxr")
    model_maxr.train(training_frame=d, x=my_x, y=my_y)
    model_allsubsets = modelSelection(seed=12345, max_predictor_number=3, mode="allsubsets")
    model_allsubsets.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_allsubsets = model_allsubsets.get_best_R2_values()
    best_predictor_names_allsubsets = model_allsubsets.get_best_model_predictors()
    best_r2_value_maxr = model_maxr.get_best_R2_values()
    # assert that the one-predictor model found by modelselection is the best, by comparing it
    # against manually trained one-predictor GLMs
    one_pred_r2 = []
    for pred in my_x:
        x = [pred]
        m = glm(seed=12345)
        m.train(training_frame=d, x=x, y=my_y)
        one_pred_r2.append(m.r2())
    best_r2 = max(one_pred_r2)
    assert abs(best_r2 - best_r2_value_allsubsets[0]) < 1e-6, \
        "expected best r2: {0}, allsubsets actual best r2: {1}. They are different." \
        "".format(best_r2, best_r2_value_allsubsets[0])
    assert abs(best_r2 - best_r2_value_maxr[0]) < 1e-6, \
        "expected best r2: {0}, maxr actual best r2: {1}. They are different." \
        "".format(best_r2, best_r2_value_maxr[0])
    assert abs(best_r2_value_allsubsets[0] - best_r2_value_maxr[0]) < 1e-6, \
        "allsubsets best r2: {0}, maxr best r2: {1}. They are different." \
        "".format(best_r2_value_allsubsets[0], best_r2_value_maxr[0])
    print("Best one-predictor model uses predictor: {0}".format(best_predictor_names_allsubsets[0]))
    # repeat the comparison for three-predictor subsets
    my_x3 = [["AGE", "RACE", "CAPSULE"], ["AGE", "RACE", "DCAPS"], ["AGE", "RACE", "PSA"],
             ["AGE", "RACE", "VOL"], ["AGE", "RACE", "DPROS"], ["AGE", "CAPSULE", "DCAPS"],
             ["AGE", "CAPSULE", "PSA"], ["AGE", "CAPSULE", "VOL"], ["AGE", "CAPSULE", "DPROS"],
             ["AGE", "DCAPS", "PSA"], ["AGE", "DCAPS", "VOL"], ["AGE", "DCAPS", "DPROS"],
             ["AGE", "PSA", "VOL"], ["AGE", "PSA", "DPROS"], ["AGE", "VOL", "DPROS"],
             ["RACE", "CAPSULE", "DCAPS"], ["RACE", "CAPSULE", "PSA"], ["RACE", "CAPSULE", "VOL"],
             ["RACE", "CAPSULE", "DPROS"], ["RACE", "DCAPS", "PSA"], ["RACE", "DCAPS", "VOL"],
             ["RACE", "DCAPS", "DPROS"], ["RACE", "PSA", "VOL"], ["RACE", "PSA", "DPROS"],
             ["RACE", "VOL", "DPROS"], ["CAPSULE", "DCAPS", "PSA"], ["CAPSULE", "DCAPS", "VOL"],
             ["CAPSULE", "DCAPS", "DPROS"], ["DCAPS", "PSA", "VOL"], ["DCAPS", "PSA", "DPROS"],
             ["DCAPS", "VOL", "DPROS"], ["PSA", "VOL", "DPROS"]]
    three_pred_r2 = []
    for pred3 in my_x3:
        m = glm(seed=12345)
        m.train(training_frame=d, x=pred3, y=my_y)
        three_pred_r2.append(m.r2())
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred - best_r2_value_allsubsets[2]) < 1e-6, \
        "expected best r2: {0}, allsubsets actual best r2: {1}. They are different." \
        "".format(best_r2_three_pred, best_r2_value_allsubsets[2])
    assert abs(best_r2_three_pred - best_r2_value_maxr[2]) < 1e-6, \
        "expected best r2: {0}, maxr actual best r2: {1}. They are different." \
        "".format(best_r2_three_pred, best_r2_value_maxr[2])
    assert abs(best_r2_value_allsubsets[2] - best_r2_value_maxr[2]) < 1e-6, \
        "allsubsets best r2: {0}, maxr best r2: {1}. They are different." \
        "".format(best_r2_value_allsubsets[2], best_r2_value_maxr[2])
    print("Best three-predictor model uses predictors: {0}".format(best_predictor_names_allsubsets[2]))
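
# Each test function above normally lives in its own pyunit file and is hooked into the h2o-3
# test harness with the standard boilerplate below. A minimal sketch, shown here for
# glm_alpha_lambda_arrays_cv and assuming the usual pyunit_utils.standalone_test entry point:
if __name__ == "__main__":
    pyunit_utils.standalone_test(glm_alpha_lambda_arrays_cv)
else:
    glm_alpha_lambda_arrays_cv()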