def test_modelselection_gaussian_model_id():
    """Models retrieved via the result frame and via ``best_model_ids`` must
    predict identically, and the maxr model for the same subset size must
    match the allsubsets predictions."""
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    model_all = modelSelection(seed=12345, max_predictor_number=7,
                               mode="allsubsets")
    model_all.train(training_frame=prostate, x=predictors, y=response)
    frame_all = model_all.result()
    n_models = frame_all.nrows
    ids_all = model_all._model_json["output"]["best_model_ids"]

    model_mr = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    model_mr.train(training_frame=prostate, x=predictors, y=response)
    frame_mr = model_mr.result()

    for row in range(n_models):
        # predictions of the model referenced by the result frame ...
        from_frame = h2o.get_model(frame_all["model_id"][row, 0])
        pred_from_frame = from_frame.predict(prostate)
        # ... must equal predictions of the model referenced by best_model_ids
        from_id = h2o.get_model(ids_all[row]['name'])
        pred_from_id = from_id.predict(prostate)
        pyunit_utils.compare_frames_local(pred_from_frame, pred_from_id,
                                          prob=1)
        # maxr model of the same subset size must predict the same values
        from_frame_mr = h2o.get_model(frame_mr["model_id"][row, 0])
        pred_mr = from_frame_mr.predict(prostate)
        pyunit_utils.compare_frames_local(pred_from_frame, pred_mr, prob=1,
                                          tol=1e-6)
def test_modelselection_backward_serialization():
    """Verify backward-mode model selection is reproducible and survives a
    download/load round trip.

    Bug fix: the second result frame was taken from ``model_backward`` instead
    of ``model_backward2``, so the "duplicate runs produce same results"
    check compared a model against itself and could never fail.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    # make sure duplicate runs produce same results
    model_backward = modelSelection(seed=12345, mode="backward",
                                    family='negativebinomial', link="log",
                                    alpha=0.5, lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward",
                                     family='negativebinomial', link="log",
                                     alpha=0.5, lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()    # get result frame
    result2 = model_backward2.result()  # fixed: result frame of the SECOND run
    # compare result from both models and they should the same
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0)
    num_models = result.nrows  # number of models built
    one_model = h2o.get_model(result["model_id"][num_models - 1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir)  # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir)  # store the model
    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)
    result_frame_backward = loaded_backward_model.result()
    # reloaded model must reproduce the predictions saved before remove_all()
    model_from_frame_backward = h2o.get_model(
        result_frame_backward["model_id"][num_models - 1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model,
                                      prob=1.0)
def test_gaussian_result_frame_model_id():
    """Cross-check R2 values obtained from model attributes, from the result
    frame, and from the rebuilt models, for allsubsets/maxr/maxrsweep."""
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    maxr_model = modelSelection(seed=12345, max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=data, x=predictors, y=response)
    maxrsweep_model = modelSelection(seed=12345, max_predictor_number=7,
                                     mode="maxrsweep")
    maxrsweep_model.train(training_frame=data, x=predictors, y=response)
    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(maxr_model.result()[2:4],
                                      maxrsweep_model.result()[2:4],
                                      prob=1.0, tol=1e-6)

    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=data, x=predictors, y=response)
    frame_allsubsets = allsubsets_model.result()
    n_rows = frame_allsubsets.nrows
    r2_list_allsubsets = allsubsets_model.get_best_R2_values()
    frame_maxr = maxr_model.result()
    r2_list_maxr = maxr_model.get_best_R2_values()

    for row in range(n_rows):
        # r2 from attributes
        best_r2_allsubsets = r2_list_allsubsets[row]
        model_allsubsets = h2o.get_model(frame_allsubsets["model_id"][row, 0])
        pred_allsubsets = model_allsubsets.predict(data)
        print("last element of predictor frame: {0}".format(
            pred_allsubsets[pred_allsubsets.nrows - 1,
                            pred_allsubsets.ncols - 1]))
        assert pred_allsubsets.nrows == data.nrows, \
            "expected dataset row: {0}, actual dataset row: " \
            "{1}".format(pred_allsubsets.nrows, data.nrows)
        best_r2_maxr = r2_list_maxr[row]
        model_maxr = h2o.get_model(frame_maxr["model_id"][row, 0])
        pred_maxr = model_maxr.predict(data)
        # compare allsubsets and maxr results
        pyunit_utils.compare_frames_local(pred_maxr, pred_allsubsets,
                                          prob=1, tol=1e-6)
        # r2 from result frame
        frame_r2 = frame_allsubsets["best_r2_value"][row, 0]
        # r2 from model
        model_r2 = model_allsubsets.r2()
        # make sure all r2 are equal
        assert abs(best_r2_allsubsets - frame_r2) < 1e-6, \
            "expected best r2: {0}, actual best r2: " \
            "{1}".format(best_r2_allsubsets, frame_r2)
        assert abs(frame_r2 - model_r2) < 1e-6, \
            "expected best r2: {0}, actual best r2: " \
            "{1}".format(model_r2, frame_r2)
        assert abs(best_r2_maxr - model_r2) < 1e-6, \
            "expected best r2: {0}, maxr best r2: {1}" \
            "".format(best_r2_maxr, model_r2)
def test_modelseletion_modelselection_cross_validation():
    """Best R2 values from cross-validated model selection must not depend on
    the fold-assignment scheme (random vs. auto) at a fixed seed, and must
    agree between allsubsets and maxr modes."""
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    # predictors C1..C20; C1..C10 are treated as categorical
    predictors = ["C{0}".format(i) for i in range(1, 21)]
    for col in predictors[:10]:
        data[col] = data[col].asfactor()
    n_folds = 3
    maxr_random = modelSelection(seed=12345, max_predictor_number=3,
                                 nfolds=n_folds, fold_assignment="random",
                                 mode="maxr")
    maxr_random.train(training_frame=data, x=predictors, y=response)
    r2_maxr_random = maxr_random.get_best_R2_values()
    maxr_auto = modelSelection(seed=12345, max_predictor_number=3,
                               nfolds=n_folds, fold_assignment="auto",
                               mode="maxr")
    maxr_auto.train(training_frame=data, x=predictors, y=response)
    r2_maxr_auto = maxr_auto.get_best_R2_values()
    # both models should provide same best R2 values
    pyunit_utils.equal_two_arrays(r2_maxr_random, r2_maxr_auto, eps=1e-6)
    allsubsets_random = modelSelection(seed=12345, max_predictor_number=3,
                                       nfolds=n_folds,
                                       fold_assignment="random",
                                       mode="allsubsets")
    allsubsets_random.train(training_frame=data, x=predictors, y=response)
    r2_allsubsets_random = allsubsets_random.get_best_R2_values()
    # maxr and allsubsets r2 should equal
    pyunit_utils.equal_two_arrays(r2_allsubsets_random, r2_maxr_random,
                                  eps=1e-6)
    allsubsets_auto = modelSelection(seed=12345, max_predictor_number=3,
                                     nfolds=n_folds, fold_assignment="auto",
                                     mode="allsubsets")
    allsubsets_auto.train(training_frame=data, x=predictors, y=response)
    r2_allsubsets_auto = allsubsets_auto.get_best_R2_values()
    # maxr and allsubsets r2 should equal
    pyunit_utils.equal_two_arrays(r2_allsubsets_auto, r2_maxr_auto, eps=1e-6)
def test_modelselection_validation():
    """Adding a validation frame must change the reported best R2 values;
    allsubsets and maxr with a validation frame must still agree.

    Bug fix: ``best_predictor_allsubsets_v`` was read from the model trained
    WITHOUT a validation frame (``allsubsets_model``), so the predictor-subset
    comparison below compared a list with itself; it now comes from
    ``allsubsets_model_v``.
    """
    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10",
            "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19",
            "C20"]
    factor_x = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factor_x:
        d[x] = d[x].asfactor()
    frames = d.split_frame(ratios=[0.8], seed=12345)
    train = frames[0]
    test = frames[1]
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=3,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=train, x=my_x, y=my_y)
    best_r2_allsubsets = allsubsets_model.get_best_R2_values()
    best_predictor_allsubsets = allsubsets_model.get_best_model_predictors()
    allsubsets_model_v = modelSelection(seed=12345, max_predictor_number=3,
                                        mode="allsubsets")
    allsubsets_model_v.train(training_frame=train, validation_frame=test,
                             x=my_x, y=my_y)
    best_r2_allsubsets_v = allsubsets_model_v.get_best_R2_values()
    # fixed: take the predictor subsets from the validation-frame model
    best_predictor_allsubsets_v = \
        allsubsets_model_v.get_best_model_predictors()
    maxr_model_v = modelSelection(seed=12345, max_predictor_number=3,
                                  mode="maxr")
    maxr_model_v.train(training_frame=train, validation_frame=test, x=my_x,
                       y=my_y)
    best_r2_maxr_v = maxr_model_v.get_best_R2_values()
    best_predictor_maxr_v = maxr_model_v.get_best_model_predictors()
    # R2 values are different between the two models
    numSet = len(best_r2_allsubsets)
    for index in range(numSet):
        one_best_predictor_allsubsets = best_predictor_allsubsets[index]
        one_best_predictor_v_allsubsets = best_predictor_allsubsets_v[index]
        one_best_r2_allsubsets = best_r2_allsubsets[index]
        one_best_r2_v_allsubsets = best_r2_allsubsets_v[index]
        best_r2_v_maxr = best_r2_maxr_v[index]
        if one_best_predictor_allsubsets == one_best_predictor_v_allsubsets:
            # same subset scored with/without validation must differ
            assert not (one_best_r2_allsubsets == one_best_r2_v_allsubsets), \
                "R2 values should not equal"
        assert abs(one_best_r2_v_allsubsets - best_r2_v_maxr) < 1e-6, \
            "allsubset best R2: {0}, maxr best R2: {1}. They " \
            "are different.".format(one_best_r2_v_allsubsets, best_r2_v_maxr)
def test_modelselection_gaussian_coefs():
    """Coefficients reported by the selection model (per subset size, raw and
    standardized) must match those of the underlying GLM models, for both
    allsubsets and maxr modes."""
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=data, x=predictors, y=response)
    coefs_allsubsets = allsubsets_model.coef()
    coefs_norm_allsubsets = allsubsets_model.coef_norm()
    maxrsweep_model = modelSelection(seed=12345, max_predictor_number=7,
                                     mode="maxrsweep")
    maxrsweep_model.train(training_frame=data, x=predictors, y=response)
    maxr_model = modelSelection(seed=12345, max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=data, x=predictors, y=response)
    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(maxr_model.result()[2:4],
                                      maxrsweep_model.result()[2:4],
                                      prob=1.0, tol=1e-6)
    coefs_maxr = maxr_model.coef()
    coefs_norm_maxr = maxr_model.coef_norm()
    for ind, one_coef_allsubsets in enumerate(coefs_allsubsets):
        one_coef_norm_allsubsets = coefs_norm_allsubsets[ind]
        one_coef_maxr = coefs_maxr[ind]
        one_coef_norm_maxr = coefs_norm_maxr[ind]
        # coefficients obtained from accessing model_id, generate model and
        # access the model coeffs
        one_model = h2o.get_model(
            allsubsets_model._model_json["output"]["best_model_ids"][ind]
            ['name'])
        model_coef = one_model.coef()
        model_coef_norm = one_model.coef_norm()
        # get coefficients of individual predictor subset size
        subset_size = ind + 1
        one_model_coef = allsubsets_model.coef(subset_size)
        one_model_coef_norm = allsubsets_model.coef_norm(subset_size)
        # check coefficient dicts are equal
        pyunit_utils.assertCoefDictEqual(one_coef_allsubsets, model_coef,
                                         1e-6)
        pyunit_utils.assertCoefDictEqual(one_coef_norm_allsubsets,
                                         model_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef, model_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef_norm,
                                         model_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef, one_coef_maxr, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef_norm,
                                         one_coef_norm_maxr, 1e-6)
def test_modelselection_backward_gaussian():
    """Backward elimination on the weighted gaussian data must drop the
    predictors in the known order, each with the known p-value at removal."""
    expected_order = ["C72", "C70", "C69", "C48", "C38", "C96", "C10", "C29",
                      "C22", "C100", "C82", "C56", "C92", "C99", "C57"]
    expected_p_values = [0.9822, 0.9054, 0.7433, 0.4095, 0.1679, 0.1551,
                         0.0438, 0.0119, 0.0107, 0.0094, 0.0099, 0.0066,
                         0.0003, 0.0002, 0.0002]
    d = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/model_selection/maxrglm100Cols50KRowsWeighted.csv"))
    my_y = "response"
    my_x = d.names
    my_x.remove(my_y)
    my_x.remove("weight")
    min_predictor_num = 100 - len(expected_order)
    model_backward = modelSelection(seed=12345,
                                    min_predictor_number=min_predictor_num,
                                    mode="backward", family='gaussian',
                                    weights_column='weight')
    model_backward.train(training_frame=d, x=my_x, y=my_y)
    # check predictor deletion order same as in predictor_elimination_order
    output = model_backward._model_json["output"]
    num_models = len(output['best_model_predictors'])
    for counter, ind in enumerate(range(num_models - 1, 0, -1)):
        pred_large = output["best_model_predictors"][ind]
        pred_small = output["best_model_predictors"][ind - 1]
        predictor_removed = set(pred_large).symmetric_difference(
            pred_small).pop()
        assert predictor_removed == expected_order[counter], \
            "expected eliminated predictor {0}, " \
            "actual eliminated predictor {1}".format(expected_order[counter],
                                                     predictor_removed)
        removed_index = output["coefficient_names"][ind].index(
            predictor_removed)
        removed_pvalue = round(output["coef_p_values"][ind][removed_index], 4)
        # assert p-values of coefficients removed by h2o equals to customer ones
        assert abs(removed_pvalue - expected_p_values[counter]) < 1e-6, \
            "Expected p-value of eliminated coefficient: {0}. Actual: {1}. " \
            "They are very different.".format(expected_p_values[counter],
                                              removed_pvalue)
    coefs = model_backward.coef(len(pred_large))
    # check coefficients result correct length
    assert len(coefs) == len(pred_large), \
        "Expected coef length: {0}, Actual: {1}".format(len(coefs),
                                                        len(pred_large))
def test_modelselection_serialization():
    """Model-selection models must survive download/load: predictions from
    models referenced by the reloaded allsubsets and maxr result structures
    must match.

    Bug fix: ``modelIDs_maxr`` was read from ``loaded_allsubsets_model``
    instead of ``loaded_maxr_model``, so the maxr branch re-tested the
    allsubsets IDs.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    tmpdir = tempfile.mkdtemp()
    model_path_allsubsets = allsubsets_model.download_model(tmpdir)
    maxr_model = modelSelection(seed=12345, max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    model_path_maxr = maxr_model.download_model(tmpdir)
    h2o.remove_all()
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_allsubsets_model = h2o.load_model(model_path_allsubsets)
    result_frame_allsubsets = loaded_allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = loaded_allsubsets_model._model_json["output"][
        "best_model_ids"]
    loaded_maxr_model = h2o.load_model(model_path_maxr)
    # fixed: best_model_ids must come from the reloaded maxr model
    modelIDs_maxr = loaded_maxr_model._model_json["output"]["best_model_ids"]
    for ind in range(numRows):
        model_from_frame_allsubsets = h2o.get_model(
            result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        model_from_id_allsubsets = h2o.get_model(
            modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_id_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_allsubsets, prob=1)
        model_from_id_maxr = h2o.get_model(modelIDs_maxr[ind]['name'])
        pred_id_maxr = model_from_id_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_id_maxr,
                                          prob=1)
def test_modelselection_backward_gaussian():
    """Backward elimination on the weighted binomial data: record the
    elimination order and p-values, require at least two predictors in
    common with the expected elimination order, and print both for
    inspection."""
    expected_order = ["C15", "C33", "C164", "C144", "C27"]
    expected_p_values = [0.6702, 0.6663, 0.0157, 0.0026, 0.0002]
    d = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/model_selection/backwardBinomial200C50KRowsWeighted.csv"
    ))
    my_y = "response"
    my_x = d.names
    my_x.remove(my_y)
    my_x.remove("weight")
    min_predictor_num = 200 - len(expected_order)
    model_backward = modelSelection(seed=12345,
                                    min_predictor_number=min_predictor_num,
                                    mode="backward", family='binomial',
                                    link='logit', weights_column='weight')
    model_backward.train(training_frame=d, x=my_x, y=my_y)
    # check predictor deletion order same as in predictor_elimination_order
    output = model_backward._model_json['output']
    num_models = len(output['best_model_predictors'])
    pred_ele = []
    pred_pvalue = []
    for ind in range(num_models - 1, 0, -1):
        pred_large = output["best_model_predictors"][ind]
        pred_small = output["best_model_predictors"][ind - 1]
        removed = set(pred_large).symmetric_difference(pred_small).pop()
        pred_ele.append(removed)
        removed_index = output["coefficient_names"][ind].index(removed)
        pred_pvalue.append(
            round(output["coef_p_values"][ind][removed_index], 4))
    coefs = model_backward.coef(len(pred_large))
    # check coefficients result correct length
    assert len(coefs) == len(pred_large) + 1, \
        "Expected coef length: {0}, Actual: {1}".format(len(coefs),
                                                        len(pred_large) + 1)
    common_elimination = list(set(expected_order) & set(pred_ele))
    assert len(common_elimination) >= 2
    print("Expected predictor elimination order: {0}".format(expected_order))
    print("Expected predictor p-values: {0}".format(expected_p_values))
    print("Predictor elimination order: {0}".format(pred_ele))
    print("Predictor p-values: {0}".format(pred_pvalue))
def test_modelselection_backward_gaussian():
    """Backward elimination on the binomial data with a synthetic weight
    column: every eliminated predictor must appear in the expected order
    list, and the removal p-values must match the expected ones."""
    expected_order = ['C33', 'C24', 'C164', 'C66', 'C15']
    expected_p_values = [0.9711, 0.0694, 0.0388, 0.0127, 0.0009]
    tst_data = h2o.import_file(pyunit_utils.locate(
        "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    predictors = tst_data.columns[0:-1]
    response_col = 'response'
    weight = 'wt'
    # weight rows with response == 1 a hundred times heavier
    tst_data['wt'] = 1
    tst_data[tst_data['response'] == 1, 'wt'] = 100
    tst_data['response'] = tst_data['response'].asfactor()
    min_predictor_num = 200 - len(expected_order)
    model_backward = modelSelection(family='binomial', weights_column=weight,
                                    mode='backward',
                                    min_predictor_number=min_predictor_num)
    model_backward.train(training_frame=tst_data, x=predictors,
                         y=response_col)
    # check predictor deletion order same as in predictor_elimination_order
    output = model_backward._model_json['output']
    num_models = len(output['best_model_predictors'])
    pred_ele = []
    pred_pvalue = []
    for ind in range(num_models - 1, 0, -1):
        pred_large = output["best_model_predictors"][ind]
        pred_small = output["best_model_predictors"][ind - 1]
        removed = set(pred_large).symmetric_difference(pred_small).pop()
        pred_ele.append(removed)
        removed_index = output["coefficient_names"][ind].index(removed)
        pred_pvalue.append(
            round(output["coef_p_values"][ind][removed_index], 4))
    coefs = model_backward.coef(len(pred_large))
    # check coefficients result correct length
    assert len(coefs) == len(pred_large), \
        "Expected coef length: {0}, Actual: {1}".format(len(coefs),
                                                        len(pred_large))
    common_elimination = list(set(expected_order) & set(pred_ele))
    assert len(common_elimination) == len(pred_ele)
    pyunit_utils.equal_two_arrays(pred_pvalue, expected_p_values,
                                  tolerance=1e-6)
def test_modelselection_cv_result_frame_model_id():
    """With cross-validation enabled, models looked up via the result frame
    and via ``best_model_ids`` must predict identically, and the maxr models
    must match the allsubsets ones."""
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10",
                  "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18",
                  "C19", "C20"]
    for col in ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]:
        data[col] = data[col].asfactor()
    n_folds = 3
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=3,
                                      nfolds=n_folds, fold_assignment="auto",
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=data, x=predictors, y=response)
    frame_allsubsets = allsubsets_model.result()
    maxr_model = modelSelection(seed=12345, max_predictor_number=3,
                                nfolds=n_folds, fold_assignment="auto",
                                mode="maxr")
    maxr_model.train(training_frame=data, x=predictors, y=response)
    frame_maxr = maxr_model.result()
    num_rows = frame_allsubsets.nrows
    ids_allsubsets = allsubsets_model._model_json["output"]["best_model_ids"]
    ids_maxr = maxr_model._model_json["output"]["best_model_ids"]
    for ind in range(num_rows):
        from_frame_allsubsets = h2o.get_model(
            frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = from_frame_allsubsets.predict(data)
        from_id_allsubsets = h2o.get_model(ids_allsubsets[ind]['name'])
        pred_id_allsubsets = from_id_allsubsets.predict(data)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_allsubsets, prob=1)
        # compare results from maxr with allsubsets
        from_frame_maxr = h2o.get_model(frame_maxr["model_id"][ind, 0])
        pred_frame_maxr = from_frame_maxr.predict(data)
        from_id_maxr = h2o.get_model(ids_maxr[ind]['name'])
        pred_id_maxr = from_id_maxr.predict(data)
        pyunit_utils.compare_frames_local(pred_frame_maxr, pred_id_maxr,
                                          prob=1)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_id_maxr,
                                          prob=1)
def test_modelselection_cross_validation():
    """Best R2 values from model selection with explicit fold columns must be
    identical between modulo and kfold fold assignment, and between
    allsubsets and maxr modes."""
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10",
                  "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18",
                  "C19", "C20"]
    for col in ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]:
        data[col] = data[col].asfactor()
    n_folds = 3
    fold_modulo = data.modulo_kfold_column(n_folds=n_folds)
    fold_modulo.set_names(["fold_numbers_modulo"])
    fold_kfold = data.kfold_column(n_folds=n_folds, seed=12345)
    fold_kfold.set_names(["fold_numbers_kfold"])
    # append the fold_numbers column to the cars dataset
    data = data.cbind(fold_modulo)
    data = data.cbind(fold_kfold)
    # cv model with fold assignment
    allsubsets_fa = modelSelection(seed=12345, max_predictor_number=3,
                                   fold_column="fold_numbers_modulo",
                                   mode="allsubsets")
    allsubsets_fa.train(training_frame=data, x=predictors, y=response)
    r2_allsubsets_fa = allsubsets_fa.get_best_R2_values()
    allsubsets_fk = modelSelection(seed=12345, max_predictor_number=3,
                                   fold_column="fold_numbers_kfold",
                                   mode="allsubsets")
    allsubsets_fk.train(training_frame=data, x=predictors, y=response)
    r2_allsubsets_fk = allsubsets_fk.get_best_R2_values()
    # both models should provide same best R2 values
    pyunit_utils.equal_two_arrays(r2_allsubsets_fa, r2_allsubsets_fk,
                                  eps=1e-6)
    # cv model with fold assignment
    maxr_fa = modelSelection(seed=12345, max_predictor_number=3,
                             fold_column="fold_numbers_modulo", mode="maxr")
    maxr_fa.train(training_frame=data, x=predictors, y=response)
    r2_maxr_fa = maxr_fa.get_best_R2_values()
    maxr_fk = modelSelection(seed=12345, max_predictor_number=3,
                             fold_column="fold_numbers_kfold", mode="maxr")
    maxr_fk.train(training_frame=data, x=predictors, y=response)
    r2_maxr_fk = maxr_fk.get_best_R2_values()
    # both models should provide same best R2 values
    pyunit_utils.equal_two_arrays(r2_allsubsets_fa, r2_maxr_fa, eps=1e-6)
    pyunit_utils.equal_two_arrays(r2_allsubsets_fk, r2_maxr_fk, eps=1e-6)
def test_modelselection_gaussian():
    """Compare modelSelection's best-subset R2 values against brute-force GLM
    training for subset sizes 1 and 3, for both allsubsets and maxr modes.

    Bug fix: the hand-written list of 3-predictor subsets contained
    duplicates (e.g. AGE/DCAPS/PSA twice) and omitted several combinations
    (e.g. CAPSULE/PSA/VOL), so the manual baseline could miss the true best
    subset; the subsets are now generated exhaustively with
    ``itertools.combinations``.
    """
    from itertools import combinations

    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model_maxr = modelSelection(seed=12345, max_predictor_number=3,
                                mode="maxr")
    model_maxr.train(training_frame=d, x=my_x, y=my_y)
    model_allsubsets = modelSelection(seed=12345, max_predictor_number=3,
                                      mode="allsubsets")
    model_allsubsets.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_allsubsets = model_allsubsets.get_best_R2_values()
    best_predictor_names_allsubsets = \
        model_allsubsets.get_best_model_predictors()
    best_r2_value_maxr = model_maxr.get_best_R2_values()
    # assert that model returned with one predictor found by modelselection
    # is the best by comparing it to manual training result
    one_pred_r2 = []
    for pred in my_x:
        m = glm(seed=12345)
        m.train(training_frame=d, x=[pred], y=my_y)
        one_pred_r2.append(m.r2())
    best_r2 = max(one_pred_r2)
    assert abs(best_r2 - best_r2_value_allsubsets[0]) < 1e-6, \
        "expected best r2: {0}, allsubset: actual best r2:{1}. " \
        " They are different.".format(best_r2, best_r2_value_allsubsets[0])
    assert abs(best_r2 - best_r2_value_maxr[0]) < 1e-6, \
        "expected best r2: {0}, maxr: actual best r2:{1}. " \
        " They are different.".format(best_r2, best_r2_value_maxr[0])
    assert abs(best_r2_value_allsubsets[0] - best_r2_value_maxr[0]) < 1e-6, \
        "allsubset best r2: {0}, maxr best r2:{1}. " \
        " They are different.".format(best_r2_value_allsubsets[0],
                                      best_r2_value_maxr[0])
    print("Best one predictor model uses predictor: {0}".format(
        best_predictor_names_allsubsets[0]))
    # brute force over ALL C(7,3) = 35 three-predictor subsets
    three_pred_r2 = []
    for pred3 in combinations(my_x, 3):
        m = glm(seed=12345)
        m.train(training_frame=d, x=list(pred3), y=my_y)
        three_pred_r2.append(m.r2())
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred - best_r2_value_allsubsets[2]) < 1e-6, \
        "expected best r2: {0}, allsubsets: actual best r2:{1}. They are " \
        "different.".format(best_r2_three_pred, best_r2_value_allsubsets[2])
    assert abs(best_r2_three_pred - best_r2_value_maxr[2]) < 1e-6, \
        "expected best r2: {0}, maxr: actual best r2:{1}. They are " \
        "different.".format(best_r2_three_pred, best_r2_value_maxr[2])
    assert abs(best_r2_value_allsubsets[2] - best_r2_value_maxr[2]) < 1e-6, \
        "allsubset best r2: {0}, maxr: actual best r2:{1}. They are " \
        "different.".format(best_r2_value_allsubsets[2],
                            best_r2_value_maxr[2])
    print("Best three predictors model uses predictors: {0}".format(
        best_predictor_names_allsubsets[2]))