def test_explanation_list_of_models_binomial_classification():
    """Exercise the explanation API on a plain list of binomial models.

    Trains an AutoML run (``max_models=5``) plus one explicitly named GBM on
    the prostate data with ``CAPSULE`` converted to a factor. It then checks
    that every plotting helper yields a matplotlib figure and that
    ``h2o.explain`` / ``h2o.explain_row`` return ``H2OExplanation`` objects.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Get at most one column from each type, never the response itself.
    # The response is rejected up front so it is excluded even when it is the
    # first column visited, i.e. while nothing has been selected yet.
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y:
            continue
        if any(typ == train.types[picked] for picked in cols_to_test):
            continue
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)
    leaderboard_ids = aml.leaderboard["model_id"].as_data_frame(use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_ids]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models += [gbm]

    # test variable importance heatmap plot
    assert isinstance(h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(models, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(h2o.pd_multi_plot(models, train, col).figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test learning curve
    for model in models:
        assert isinstance(model.learning_curve_plot().figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(models, train, 1, render=False), H2OExplanation)
def test_explanation_automl_multinomial_classification():
    """Exercise the explanation API on an AutoML object with a multinomial target.

    Trains AutoML (``max_models=5``) on the iris data with ``response``
    converted to a factor. It checks the plotting helpers, the tabular
    ``varimp`` / ``model_correlation`` outputs, ``explain`` and
    ``explain_row``, and finally that a sliced leaderboard frame is accepted
    by ``h2o.explain`` / ``h2o.explain_row``.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris2.csv"))
    y = "response"
    train[y] = train[y].asfactor()

    # Get at most one column from each type, never the response itself.
    # The response is rejected up front so it is excluded even when it is the
    # first column visited, i.e. while nothing has been selected yet.
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y:
            continue
        if any(typ == train.types[picked] for picked in cols_to_test):
            continue
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert len(aml.model_correlation(train, use_pandas=False)) == 2
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(aml.pd_multi_plot(train, col, target="setosa").figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # Leaderboard slices work
    # Keep only the rows whose model_id does not match "^Stacked"; the slice
    # is built once and shared by both checks below.
    leaderboard_without_SE = aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :]

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
def test_explanation_list_of_models_multinomial_classification():
    """Exercise the explanation API on a plain list of multinomial models.

    Trains AutoML (``max_models=5``) on the iris data with ``response``
    converted to a factor, fetches every leaderboard model, and checks the
    plotting helpers as well as ``h2o.explain`` / ``h2o.explain_row``.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris2.csv"))
    y = "response"
    train[y] = train[y].asfactor()

    # Get at most one column from each type, never the response itself.
    # The response is rejected up front so it is excluded even when it is the
    # first column visited, i.e. while nothing has been selected yet.
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y:
            continue
        if any(typ == train.types[picked] for picked in cols_to_test):
            continue
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)
    leaderboard_ids = aml.leaderboard["model_id"].as_data_frame(use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_ids]

    # The plot results below are unwrapped with ``.figure()`` before the type
    # check, exactly as the other explanation tests in this file do.

    # test variable importance heatmap plot
    assert isinstance(h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(models, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(h2o.pd_multi_plot(models, train, col, target="setosa").figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(models, train, 1, render=False), H2OExplanation)
def test_explanation_automl_binomial_classification():
    """Exercise the explanation API on an AutoML object with a binomial target.

    Trains AutoML (``max_models=5``) on the prostate data with ``CAPSULE``
    converted to a factor. The first half checks the methods exposed on the
    AutoML object; the second half repeats the checks through the ``h2o`` /
    ``h2o.explanation`` functions using a leaderboard slice whose
    ``model_id`` does not match ``"^Stacked"``.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Get at most one column from each type, never the response itself.
    # The response is rejected up front so it is excluded even when it is the
    # first column visited, i.e. while nothing has been selected yet.
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y:
            continue
        if any(typ == train.types[picked] for picked in cols_to_test):
            continue
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test that num_of_features is propagated
    for n_features in [1, 3, 5]:
        heatmap_axes = aml.varimp_heatmap(num_of_features=n_features).figure().get_axes()[0]
        # The number of y ticks on the first axes must equal the request.
        assert n_features == len(heatmap_axes.get_yticks())
        matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert len(aml.model_correlation(train, use_pandas=False)) == 2
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(aml.pd_multi_plot(train, col).figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # Leaderboard slices work
    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    leaderboard_without_SE = aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :]
    # numpy.ndarray, colnames, rownames
    assert len(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=False)) == 3
    assert isinstance(h2o.explanation.varimp(leaderboard_without_SE, use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(leaderboard_without_SE, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert len(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=False)) == 2
    assert isinstance(h2o.explanation.model_correlation(leaderboard_without_SE, train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    assert isinstance(h2o.pd_multi_plot(leaderboard_without_SE, train, cols_to_test[0]).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)
def test_explanation_automl_regression():
    """Exercise the explanation API on an AutoML object with a numeric target.

    Trains AutoML (``max_models=5``) on the expanded titanic data to predict
    ``fare``. It checks the plotting helpers, the tabular ``varimp`` /
    ``model_correlation`` outputs, ``explain`` / ``explain_row``, the
    ``_shorten_model_ids`` helper, and that a sliced leaderboard frame is
    accepted by ``h2o.explain`` / ``h2o.explain_row``.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    # Get at most one column from each type, never the response itself.
    # The response is rejected up front so it is excluded even when it is the
    # first column visited, i.e. while nothing has been selected yet.
    cols_to_test = []
    for col, typ in train.types.items():
        if col == y:
            continue
        if any(typ == train.types[picked] for picked in cols_to_test):
            continue
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # test variable importance heatmap plot
    assert isinstance(aml.varimp_heatmap().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    assert len(aml.varimp(use_pandas=False)) == 3  # numpy.ndarray, colnames, rownames
    assert isinstance(aml.varimp(use_pandas=True), pandas.DataFrame)

    # test model correlation heatmap plot
    assert isinstance(aml.model_correlation_heatmap(train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # numpy.ndarray, colnames and rownames both in the same order => represented by just one vector
    assert len(aml.model_correlation(train, use_pandas=False)) == 2
    assert isinstance(aml.model_correlation(train, use_pandas=True), pandas.DataFrame)

    # test partial dependences
    for col in cols_to_test:
        try:
            # Only the plotting call is guarded; a ValueError is tolerated
            # solely for the "name" column.
            pd_figure = aml.pd_multi_plot(train, col).figure()
        except ValueError:
            assert col == "name", "'name' is a string column which is not supported."
        else:
            assert isinstance(pd_figure, matplotlib.pyplot.Figure)
        matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(aml.explain(train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(aml.explain_row(train, 1, render=False), H2OExplanation)

    # test shortening model ids work correctly
    from h2o.explanation._explain import _shorten_model_ids
    model_ids = aml.leaderboard.as_data_frame()["model_id"]
    shortened_model_ids = _shorten_model_ids(model_ids)
    # Shortening must not collapse two distinct ids into one ...
    assert len(set(model_ids)) == len(set(shortened_model_ids))
    # ... and every shortened id must be strictly shorter than its original.
    for i, full_id in enumerate(model_ids):
        assert len(full_id) > len(shortened_model_ids[i])

    # Leaderboard slices work
    # Keep only the rows whose model_id does not match "^Stacked"; the slice
    # is built once and shared by both checks below.
    leaderboard_without_SE = aml.leaderboard[~aml.leaderboard["model_id"].grep("^Stacked", output_logical=True), :]

    # test explain
    assert isinstance(h2o.explain(leaderboard_without_SE, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(leaderboard_without_SE, train, 1, render=False), H2OExplanation)