def glm_alpha_array_with_lambda_search_cv():
    """Check that CV with an alpha array and lambda_search builds the same
    regularization path whether or not a validation frame is supplied."""
    # read in the dataset and construct training set (and validation set)
    print("Testing glm cross-validation with alpha array, lambda_search for multiomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5"]
    for cname in enum_columns:
        # Fix: the original assigned the column to itself (a no-op); the
        # intent of an "enum" column is a categorical conversion.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    # Fix: list.remove() mutates in place and returns None, so the original
    # silently passed x=None to train(); build the predictor list explicitly.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build model with CV but no validation dataset
    cv_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)
    # both paths must agree submodel by submodel
    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l],
                                           cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l],
                                           cv_r_valid['coefficients_std'][l], tol=1e-6)
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver):
    """Train a GLM for a few iterations, resume it from a checkpoint, and
    verify the resumed coefficients match a model run to completion."""
    train_part, valid_part = training_frame.split_frame(ratios=[0.9], seed=12345)
    # short run that will serve as the checkpoint
    short_model = H2OGeneralizedLinearEstimator(family=family, max_iterations=7, solver=solver)
    short_model.train(training_frame=train_part, x=x_indices, y=y_index,
                      validation_frame=valid_part)
    # resume training from the short run
    resumed_model = H2OGeneralizedLinearEstimator(family=family,
                                                  checkpoint=short_model.model_id,
                                                  solver=solver)
    resumed_model.train(training_frame=train_part, x=x_indices, y=y_index,
                        validation_frame=valid_part)
    # reference model allowed to run to completion in one shot
    full_model = H2OGeneralizedLinearEstimator(family=family, solver=solver)
    full_model.train(training_frame=train_part, x=x_indices, y=y_index,
                     validation_frame=valid_part)
    pyunit_utils.assertEqualCoeffDicts(resumed_model.coef(), full_model.coef(), tol=5e-2)
def test_glm_backward_compare():
    """Backward model selection stopped at the full predictor count should
    reproduce an ordinary GLM fit (with p-values) on the same weighted frame."""
    frame = h2o.import_file(pyunit_utils.locate(
        "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    x_cols = frame.columns[0:-1]
    y_col = 'response'
    weight_col = 'wt'
    # weight positive responses 100x heavier than negatives
    frame['wt'] = 1
    frame[frame['response'] == 1, 'wt'] = 100
    frame['response'] = frame['response'].asfactor()
    backward_model = H2OModelSelectionEstimator(family='binomial',
                                                weights_column=weight_col,
                                                mode='backward',
                                                min_predictor_number=200)
    backward_model.train(x_cols, y_col, training_frame=frame)
    backward_coefs = backward_model.coef()[0]
    reference_glm = H2OGeneralizedLinearEstimator(family='binomial', lambda_=0,
                                                  compute_p_values=True,
                                                  weights_column=weight_col)
    reference_glm.train(x_cols, y_col, training_frame=frame)
    pyunit_utils.assertEqualCoeffDicts(reference_glm.coef(), backward_coefs, tol=1e-6)
def glm_alpha_array_lambda_null():
    """With an alpha array and no explicit lambda, every submodel on the
    regularization path must be reproducible by a standalone GLM built with
    that submodel's (alpha, lambda) pair."""
    # first test: compare coefficients and deviance
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', alpha=[0.1, 0.5, 0.9], solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    # rebuild the best submodel from its stored coefficients
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6
    for l in range(len(r['lambdas'])):
        m = glm(family='binomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef())
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm())
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr)
        if l == best_submodel_index:
            # check training metrics, should be equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"])
        else:
            # for other submodels, residual deviance must be no better than the best.
            # Fix: corrected the misspelled assertion message ("lowerest").
            assert p.residual_deviance() >= p2.residual_deviance(), \
                "Best submodel does not have lowest residual_deviance()!"
def grab_lambda_min():
    """Every submodel found by a cold-start lambda search must be reproducible
    by fitting a standalone GLM at that submodel's (alpha, lambda)."""
    boston = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))
    # predictors are every column except the response "medv"
    # (medv = median value of owner-occupied homes in $1000's)
    predictors = boston.columns[:-1]
    response = "medv"
    # chas is the Charles River dummy (1 if tract bounds river, 0 otherwise)
    boston['chas'] = boston['chas'].asfactor()
    train, valid = boston.split_frame(ratios=[.8], seed=1234)
    boston_glm = H2OGeneralizedLinearEstimator(lambda_search=True, seed=1234, cold_start=True)
    boston_glm.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    reg_path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(boston_glm)
    for idx, lamb in enumerate(reg_path['lambdas']):
        submodel = H2OGeneralizedLinearEstimator(alpha=[reg_path['alphas'][idx]], Lambda=lamb,
                                                 solver='COORDINATE_DESCENT')
        submodel.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
        print("comparing coefficients for submodel {0}".format(idx))
        pyunit_utils.assertEqualCoeffDicts(reg_path['coefficients'][idx],
                                           submodel.coef(), tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(reg_path['coefficients_std'][idx],
                                           submodel.coef_norm(), tol=1e-6)
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver, cold_start):
    """Resume a lambda-search GLM from a checkpoint and confirm it reaches the
    same coefficients as an uninterrupted run."""
    parts = training_frame.split_frame(ratios=[0.9], seed=12345)
    train_part, valid_part = parts[0], parts[1]

    def _fit(**extra):
        # all three models share family/solver/lambda_search/cold_start
        est = H2OGeneralizedLinearEstimator(family=family, solver=solver, lambda_search=True,
                                            cold_start=cold_start, **extra)
        est.train(training_frame=train_part, x=x_indices, y=y_index,
                  validation_frame=valid_part)
        return est

    short_model = _fit(max_iterations=3)             # interrupted run
    resumed = _fit(checkpoint=short_model.model_id)  # continue from checkpoint
    complete = _fit()                                # allowed to run to completion
    pyunit_utils.assertEqualCoeffDicts(resumed.coef(), complete.coef(), tol=1e-6)
def glm_alpha_lambda_arrays_cv():
    """CV model selection with alpha and lambda arrays must be driven by the
    xval metrics, so adding a validation frame should not change the path."""
    print("Testing glm cross-validation with alpha array, lambda array for binomial models.")
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # Fix: original assigned the column to itself (no-op); the "enum"
        # columns are meant to be converted to categoricals.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    # Fix: list.remove() returns None, so the original passed x=None to
    # train(); build the predictor list without mutating h2o_data.names.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # choices made in model_all and model_xval should be the same since they should be using xval metrics
    model_all = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9],
                    nfolds=3, cold_start=True)
    model_all.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model_all_rpath = glm.getGLMRegularizationPath(model_all)
    model_xval = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9],
                     nfolds=3, cold_start=True)
    model_xval.train(x=myX, y=myY, training_frame=training_data)
    model_xval_rpath = glm.getGLMRegularizationPath(model_xval)
    for l in range(len(model_all_rpath['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients'][l],
                                           model_xval_rpath['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients_std'][l],
                                           model_xval_rpath['coefficients_std'][l], tol=1e-6)
def glm_alpha_arrays_null_lambda_cv():
    """CV with an alpha array and default lambdas should build the same
    regularization path with or without a validation frame."""
    print("Testing glm cross-validation with alpha array, default lambda values for binomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # Fix: original assigned each column to itself (no-op); categorical
        # conversion was clearly intended for the enum columns.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    # Fix: list.remove() mutates and returns None, so myX was None; build the
    # predictor list explicitly instead.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build model with CV but no validation dataset
    cv_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)
    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l],
                                           cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l],
                                           cv_r_valid['coefficients_std'][l], tol=1e-6)
def glm_alpha_lambda_arrays():
    """Lambda search with an alpha array and a validation frame: every
    submodel on the path must be reproducible by a standalone GLM."""
    # compare coefficients and deviance when only training dataset is available
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        train[ind] = train[ind].asfactor()
    train["C21"] = train["C21"].asfactor()
    frames = train.split_frame(ratios=[0.8], seed=12345)
    d = frames[0]
    d_test = frames[1]
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    # compare results when validation dataset is present
    mLVal = glm(family='binomial', alpha=[0.1, 0.5], lambda_search=True,
                solver='COORDINATE_DESCENT', nlambdas=3)  # train with validation set
    mLVal.train(training_frame=d, x=list(range(20)), y=20, validation_frame=d_test)
    rVal = glm.getGLMRegularizationPath(mLVal)
    best_submodel_indexVal = mLVal._model_json["output"]["best_submodel_index"]
    m2Val = glm.makeGLMModel(model=mLVal, coefs=rVal['coefficients'][best_submodel_indexVal])
    dev1Val = rVal['explained_deviance_valid'][best_submodel_indexVal]
    p2Val = m2Val.model_performance(d_test)
    dev2Val = 1 - p2Val.residual_deviance() / p2Val.null_deviance()
    assert abs(dev1Val - dev2Val) < 1e-6
    for l in range(len(rVal['lambdas'])):
        m = glm(family='binomial', alpha=[rVal['alphas'][l]], Lambda=rVal['lambdas'][l],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=list(range(20)), y=20, validation_frame=d_test)
        mr = glm.getGLMRegularizationPath(m)
        p = m.model_performance(d_test)
        cs = rVal['coefficients'][l]
        cs_norm = rVal['coefficients_std'][l]
        print("Comparing submodel index {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-1)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-1)
        pyunit_utils.assertEqualRegPaths(regKeys, rVal, l, mr, tol=1e-3)
        dVal = 1 - p.residual_deviance() / p.null_deviance()
        if l == best_submodel_indexVal:
            # validation metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["validation_metrics"],
                                                 mLVal._model_json["output"]["validation_metrics"],
                                                 tol=1e-2)
        else:
            # other submodels should not beat the best submodel's validation deviance.
            # Fix: the message lacked a {0} placeholder, so .format(l) silently
            # dropped the submodel index from the assertion text.
            assert dVal <= dev2Val, \
                "Best submodel does not have highest explained deviance_valid for submodel: {0}!".format(l)
def test_gridsearch():
    """Grid search over GAM subspaces: gam_columns given as nested lists or as
    a mix of bare names and lists must produce identical models."""
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    h2o_data['response'] = h2o_data['response'].asfactor()
    h2o_data['C3'] = h2o_data['C3'].asfactor()
    h2o_data['C7'] = h2o_data['C7'].asfactor()
    h2o_data['C8'] = h2o_data['C8'].asfactor()
    h2o_data['C10'] = h2o_data['C10'].asfactor()
    names = h2o_data.names
    myY = "response"
    # Fix: list.remove() returns None, so the original passed x=None to
    # train(); build the predictor list without mutating names.
    myX = [name for name in names if name != myY]
    search_criteria = {'strategy': 'Cartesian'}
    hyper_parameters = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    # same grid, but single gam columns are given as bare strings
    hyper_parameters2 = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial",
                                                              keep_gam_cols=True),
                              hyper_params=hyper_parameters, search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial",
                                                               keep_gam_cols=True),
                               hyper_params=hyper_parameters2, search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)
    # compare two models by checking their coefficients.  They should be the same
    for index in range(len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(), model2.coef(), tol=1e-6)
def test_gam_model_predict():
    """A multinomial GAM trained with CV should produce the same coefficients
    whether or not a validation frame is supplied."""
    covtype_df = h2o.import_file(pyunit_utils.locate("bigdata/laptop/covtype/covtype.full.csv"))
    train, valid = covtype_df.split_frame([0.9], seed=1234)
    # Prepare predictors and response columns
    covtype_X = covtype_df.col_names[:-1]  # last column is Cover_Type, our desired response variable
    covtype_y = covtype_df.col_names[-1]
    # both models share the exact same configuration
    shared_args = dict(family='multinomial', solver='IRLSM', gam_columns=["Slope"],
                       scale=[0.0001], num_knots=[5], standardize=True, nfolds=2,
                       fold_assignment='modulo', alpha=[0.9, 0.5, 0.1],
                       lambda_search=True, nlambdas=5, max_iterations=3)
    # build model with cross validation and no validation dataset
    gam_multi = H2OGeneralizedAdditiveEstimator(**shared_args)
    gam_multi.train(covtype_X, covtype_y, training_frame=train)
    # build model with cross validation and with validation dataset
    gam_multi_valid = H2OGeneralizedAdditiveEstimator(**shared_args)
    gam_multi_valid.train(covtype_X, covtype_y, training_frame=train, validation_frame=valid)
    # model should yield the same coefficients in both cases
    pyunit_utils.assertEqualCoeffDicts(gam_multi.coef()['coefficients'],
                                       gam_multi_valid.coef()['coefficients'])
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver, cold_start, nlambdas):
    """Resume a lambda-search GLM from a checkpoint and verify each coefficient
    class of the resumed model matches a model run to completion."""
    parts = training_frame.split_frame(ratios=[0.9], seed=12345)
    train_part, valid_part = parts[0], parts[1]

    def _fit(**extra):
        # all three models share family/solver/lambda_search/cold_start/nlambdas
        est = H2OGeneralizedLinearEstimator(family=family, solver=solver, lambda_search=True,
                                            cold_start=cold_start, nlambdas=nlambdas, **extra)
        est.train(training_frame=train_part, x=x_indices, y=y_index,
                  validation_frame=valid_part)
        return est

    short_model = _fit(max_iterations=3)             # interrupted run
    resumed = _fit(checkpoint=short_model.model_id)  # resumed from checkpoint
    complete = _fit()                                # allowed to run to completion
    resumed_coef = resumed.coef()
    complete_coef = complete.coef()
    for key in complete_coef.keys():
        pyunit_utils.assertEqualCoeffDicts(resumed_coef[key], complete_coef[key], tol=1e-6)
def match_models(self):
    """For every manually-built GAM, find the grid-search model trained with
    identical hyper-parameters and assert their coefficients match; finally
    require that every expected model was matched."""
    for manual_model in self.manual_gam_models:
        manual_params = manual_model.actual_params
        target = (manual_params['gam_columns'], manual_params['scale'],
                  manual_params['num_knots'], manual_params['bs'],
                  manual_params['lambda'])
        for grid_model in self.h2o_model.models:
            grid_params = grid_model.actual_params
            candidate = (grid_params['gam_columns'], grid_params['scale'],
                         grid_params['num_knots'], grid_params['bs'],
                         grid_params['lambda'])
            if candidate != target:
                continue
            self.num_grid_models += 1
            print("grid model number " + str(self.num_grid_models))
            print("gridSearch model coefficients")
            print(grid_model.coef())
            print("manual model coefficients")
            print(manual_model.coef())
            pyunit_utils.assertEqualCoeffDicts(grid_model.coef(), manual_model.coef(), tol=1e-6)
            break
    assert self.num_grid_models == self.num_expected_models, \
        "Grid search model parameters incorrect or incorrect number of models generated"
def set_glm_startvals():
    """Check GLM startval handling: default-equivalent starting values must
    reproduce the default fit, different ones must not, and a wrong-length
    startval list must be rejected."""
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    mLcoeff = mL.coef()
    r = glm.getGLMRegularizationPath(mL)
    rcoeff = r["coefficients"][0]
    responseMean = d[1].mean()
    # null-model intercept: log odds of the response mean
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startval1 = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startval2 = [rcoeff["AGE"], rcoeff["RACE"], rcoeff["DPROS"], rcoeff["DCAPS"],
                 rcoeff["PSA"], rcoeff["VOL"], rcoeff["GLEASON"], rcoeff["Intercept"]]
    startvalBad = [0, 0]  # wrong length, should be rejected by GLM
    ml1 = glm(family="binomial", startval=startval1)  # same starting condition as GLM
    ml1.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml1Coeff = ml1.coef()
    pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml1Coeff, tol=1e-6)  # coeffs should be the same
    ml2 = glm(family="binomial", startval=startval2)  # different starting condition from GLM
    ml2.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml2Coeff = ml2.coef()
    # Fix: the original placed `assert False` inside the try block, so its
    # AssertionError was swallowed by `except Exception` and the negative
    # tests could never fail.  Record the expected failure with a flag and
    # assert outside the try instead.
    raised = False
    try:
        pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml2Coeff, tol=1e-6)
    except Exception as ex:
        raised = True
        print(ex)
    assert raised, "Should have thrown an error as coefficients are different!"
    raised = False
    try:
        mlbad = glm(family="binomial", startval=startvalBad)
        mlbad.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    except Exception as ex:
        raised = True
        print(ex)
    assert raised, "Should have thrown an error with bad GLM initial values!"
    print("Test completed! Success!")
def test_gam_model_predict():
    """A binomial GAM trained with CV should yield identical coefficients with
    or without a validation frame."""
    train = h2o.import_file(pyunit_utils.locate(
        "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    valid = h2o.import_file(pyunit_utils.locate(
        "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    x = ["C1", "C2", "C3"]
    y = "response"
    train[y] = train[y].asfactor()
    valid[y] = valid[y].asfactor()
    # both models share the exact same configuration
    shared_args = dict(family='binomial', solver='IRLSM', gam_columns=["C4"], scale=[0.0001],
                       num_knots=[5], standardize=True, nfolds=2, fold_assignment='modulo',
                       alpha=[0.9, 0.5, 0.1], lambda_search=True, nlambdas=5,
                       max_iterations=3, bs=[2], seed=12345)
    # build model with cross validation and with validation dataset
    gam_model_valid = H2OGeneralizedAdditiveEstimator(**shared_args)
    gam_model_valid.train(x, y, training_frame=train, validation_frame=valid)
    # build model with cross validation and no validation dataset
    gam_model = H2OGeneralizedAdditiveEstimator(**shared_args)
    gam_model.train(x, y, training_frame=train)
    # model should yield the same coefficients in both cases
    pyunit_utils.assertEqualCoeffDicts(gam_model.coef(), gam_model_valid.coef())
def glm_alpha_lambda_arrays():
    """Warm-start reproduction of a cold_start=False alpha/lambda-array GLM:
    each submodel is rebuilt with the previous submodel's standardized
    coefficients as starting values."""
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', Lambda=[0.9, 0.5, 0.1], alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT', cold_start=False)
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6
    responseMean = d[1].mean()
    # null-model intercept: log odds of the response mean
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startValInit = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startVal = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    orderedCoeffNames = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"]
    for l in range(len(r['lambdas'])):
        m = glm(family='binomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT', startval=startVal)
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-3)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), 1e-3)
        # when moving on to a new alpha value, restart from the null model;
        # otherwise warm-start from this submodel's standardized coefficients
        if (l + 1) < len(r['lambdas']) and r['alphas'][l] != r['alphas'][l + 1]:
            startVal = startValInit
        else:
            startVal = pyunit_utils.extractNextCoeff(
                cs_norm, orderedCoeffNames, startVal)  # prepare startval for next round
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-4)
        if l == best_submodel_index:
            # check training metrics, should be equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"],
                                                 tol=1e-4)
        else:
            # Fix: corrected the misspelled assertion message ("lowerest").
            assert p.residual_deviance() >= p2.residual_deviance(), \
                "Best submodel does not have lowest residual_deviance()!"
def test_multinomial_alpha():
    """Multinomial GLM with an alpha array must train reproducibly: two
    identically-configured models get identical coefficients, with and
    without lambda_search, generate_scoring_history, and cross-validation.

    Fixes: removed the unused local ``col_list_compare`` and collapsed the
    four copy-pasted build/train/compare sections into one helper.
    """
    print("Preparing dataset....")
    h2o_data = h2o.import_file(pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    for col in ["C1", "C2", "C3", "C4", "C5", "C11"]:
        h2o_data[col] = h2o_data[col].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    training_data = splits_frames[0]
    test_data = splits_frames[1]
    X = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    Y = "C11"

    def _assert_reproducible(**params):
        # build the same model twice and require identical coefficients
        coefs = []
        for _ in range(2):
            model = glm(family="multinomial", alpha=[0, 0.2, 0.5, 0.8, 1],
                        generate_scoring_history=True, **params)
            model.train(x=X, y=Y, training_frame=training_data, validation_frame=test_data)
            coefs.append(model.coef())
        for key in coefs[0].keys():
            pyunit_utils.assertEqualCoeffDicts(coefs[0][key], coefs[1][key], tol=1e-6)

    print("Building model with score_each_iteration turned on.")
    # test with lambda search on
    _assert_reproducible(lambda_search=True)
    # test with lambda search off and an explicit lambda array
    _assert_reproducible(lambda_search=False, Lambda=[0, 0.1, 0.01, 0.001])
    # test with lambda search on, cv on
    _assert_reproducible(lambda_search=True, nfolds=2, seed=12345)
    # test with lambda search off, cv on
    _assert_reproducible(lambda_search=False, nfolds=2, seed=12345,
                         Lambda=[0, 0.1, 0.01, 0.001])