def test_varimp():
    """Check the shapes returned by the AutoML variable-importance helpers.

    Trains AutoML (``max_models=5``) on the wine-quality data set and verifies:

    * ``varimp`` as a pandas frame, both through the AutoML object and through
      ``h2o.explanation.varimp`` on a leaderboard slice limited with
      ``num_of_features``;
    * ``varimp`` with ``use_pandas=False``, which yields a 3-element result
      whose first item is the matrix and whose other two items have lengths
      equal to the matrix' column and row counts respectively;
    * ``varimp_heatmap`` results expose a matplotlib figure via ``.figure()``.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/wine/winequality-redwhite-no-BOM.csv"))
    y = "quality"

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # Leaderboard rows whose model_id does not match "Stacked"; the sliced
    # checks below only look at these models.
    not_stacked = aml.leaderboard["model_id"].grep(
        "Stacked", invert=True, output_logical=True)
    base_models = aml.leaderboard[not_stacked, :]

    # pandas output
    assert aml.varimp(use_pandas=True).shape == (12, 5)
    assert h2o.explanation.varimp(
        base_models.head(3), num_of_features=3, use_pandas=True
    ).shape == (3, 3)

    # non-pandas output: (matrix, <len == n columns>, <len == n rows>)
    varimp_1 = aml.varimp(use_pandas=False)
    assert varimp_1[0].shape == (12, 5)
    assert len(varimp_1[1]) == 5
    assert len(varimp_1[2]) == 12

    varimp_2 = h2o.explanation.varimp(
        base_models.head(4), num_of_features=3, use_pandas=False)
    assert varimp_2[0].shape == (3, 4)
    assert len(varimp_2[1]) == 4
    assert len(varimp_2[2]) == 3

    # heatmap plots
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    assert isinstance(
        h2o.varimp_heatmap(base_models.head(3), num_of_features=3).figure(),
        matplotlib.pyplot.Figure)

    # Release the figures created above so they do not pile up across tests.
    matplotlib.pyplot.close("all")
def test_explanation_list_of_models_binomial_classification():
    """Run the explanation plots on a list of binomial models.

    Trains AutoML (``max_models=5``) plus one explicitly named GBM on the
    prostate data with ``CAPSULE`` converted to a factor, then checks that the
    variable-importance heatmap, model-correlation heatmap, partial-dependence
    multi plot and learning-curve plot each expose a matplotlib figure, and
    that ``h2o.explain`` / ``h2o.explain_row`` return ``H2OExplanation``.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # get at most one column from each type; the response is always skipped,
    # even when it happens to be the first column of the frame
    seen_types = set()
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    leaderboard_ids = aml.leaderboard["model_id"].as_data_frame(
        use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_ids]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models += [gbm]

    # test variable importance heatmap plot
    assert isinstance(
        h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(
        h2o.model_correlation_heatmap(models, train).figure(),
        matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(
            h2o.pd_multi_plot(models, train, col).figure(),
            matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test learning curve
    for model in models:
        assert isinstance(
            model.learning_curve_plot().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(
        h2o.explain_row(models, train, 1, render=False), H2OExplanation)
def test_varimp_heatmap_model_correlation_heatmap():
    """Check plot saving for the two multi-model heatmaps.

    Trains AutoML (``max_models=5``) plus one explicitly named GBM on the
    prostate data, then, for ``h2o.varimp_heatmap`` and
    ``h2o.model_correlation_heatmap``, hands ``test_plot_result_saving`` one
    result created without ``save_plot_path`` (paired with ``plot2.png``) and
    one created with ``save_plot_path`` pointing at ``plot1.png``.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    leaderboard_ids = aml.leaderboard["model_id"].as_data_frame(
        use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_ids]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models += [gbm]

    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)

        # NOTE(review): test_plot_result_saving is defined outside this block;
        # presumably it verifies that both plots end up on disk - confirm.
        test_plot_result_saving(
            h2o.varimp_heatmap(models), path2,
            h2o.varimp_heatmap(models, save_plot_path=path1), path1)
        test_plot_result_saving(
            h2o.model_correlation_heatmap(models, train), path2,
            h2o.model_correlation_heatmap(models, train, save_plot_path=path1),
            path1)

    # Plain call without a save path; it does not use the temporary directory.
    h2o.varimp_heatmap(models)

    # Release every figure created by this test.
    matplotlib.pyplot.close("all")
def test_explanation_list_of_models_multinomial_classification():
    """Run the explanation plots on a list of multinomial models.

    Trains AutoML (``max_models=5``) on the iris data with ``response``
    converted to a factor, then checks that the variable-importance heatmap,
    model-correlation heatmap and partial-dependence multi plot (for target
    ``"setosa"``) each expose a matplotlib figure, and that ``h2o.explain`` /
    ``h2o.explain_row`` return ``H2OExplanation``.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris2.csv"))
    y = "response"
    train[y] = train[y].asfactor()

    # get at most one column from each type; the response is always skipped,
    # even when it happens to be the first column of the frame
    seen_types = set()
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.add(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    leaderboard_ids = aml.leaderboard["model_id"].as_data_frame(
        use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_ids]

    # The plot helpers are unwrapped with .figure() here, exactly as the other
    # tests in this file do for the same functions.

    # test variable importance heatmap plot
    assert isinstance(
        h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(
        h2o.model_correlation_heatmap(models, train).figure(),
        matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(
            h2o.pd_multi_plot(models, train, col, target="setosa").figure(),
            matplotlib.pyplot.Figure)
        matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(
        h2o.explain_row(models, train, 1, render=False), H2OExplanation)