def test_custom_metric_reload():
    """Upload two different metric classes under one name and check each model.

    The first model is built with ``CustomNullFunc`` registered as
    ``custom_mm`` and is checked against the expected value 0.  The same
    function name is then re-uploaded with ``CustomOneFunc`` and a second
    model is checked against the expected value 1.
    """
    metric_name = "custom_mm"

    # Initial upload and the model built against it.
    null_metric = h2o.upload_custom_metric(CustomNullFunc, func_name=metric_name)
    first_model, first_test = regression_model(H2OGradientBoostingEstimator, null_metric)
    assert_all_metrics_equal(first_model, first_test, metric_name, 0)

    # Re-upload under the same name with a different class, then rebuild.
    one_metric = h2o.upload_custom_metric(CustomOneFunc, func_name=metric_name)
    second_model, second_test = regression_model(H2OGradientBoostingEstimator, one_metric)
    assert_all_metrics_equal(second_model, second_test, metric_name, 1)
    def test_custom_metric(self):
        """Train a weighted GBM on the loan data and check its custom metric.

        The loan CSV is imported, ``bad_loan`` is converted to a factor and
        used as the response, ``int_rate`` is excluded from the predictors and
        ``loan_amnt`` is copied into a ``weight`` column used as row weights.
        The test then asserts the name and value of the custom metric reported
        by the model's performance object.
        """
        from custom_metric_class import WeightedFalseNegativeLossMetric
        train_path = "file://" + unit_test_utils.locate("smalldata/loan.csv")
        train = h2o.import_file(train_path, destination_frame="loan_train")
        train["bad_loan"] = train["bad_loan"].asfactor()

        y = "bad_loan"
        # Predictors are computed before the weight column is added, so the
        # weight column is never part of ``x``.
        x = train.col_names
        x.remove(y)
        x.remove("int_rate")

        train["weight"] = train["loan_amnt"]

        weighted_false_negative_loss_func = h2o.upload_custom_metric(
            WeightedFalseNegativeLossMetric,
            func_name="WeightedFalseNegativeLoss",
            func_file="weighted_false_negative_loss.py")
        from h2o.estimators import H2OGradientBoostingEstimator
        gbm = H2OGradientBoostingEstimator(
            model_id="gbm.hex",
            custom_metric_func=weighted_false_negative_loss_func)
        gbm.train(y=y, x=x, training_frame=train, weights_column="weight")

        perf = gbm.model_performance()
        # ``assertEquals`` is a deprecated alias that was removed in
        # Python 3.12; ``assertEqual`` is the supported spelling.
        self.assertEqual(perf.custom_metric_name(),
                         "WeightedFalseNegativeLoss")
        self.assertEqual(perf.custom_metric_value(), 0.24579011595430142)
def test_custom_metric_from_str():
    """Upload ``CustomOneFuncStr`` as ``custom_mm`` and check the model metrics.

    ``CustomOneFuncStr`` is presumably the metric's source held in a string
    (hence the explicit ``class_name``) -- confirm against its definition.
    """
    metric_ref = h2o.upload_custom_metric(
        CustomOneFuncStr,
        class_name="CustomOneFunc",
        func_name="custom_mm",
    )
    model, test_frame = regression_model(H2OGradientBoostingEstimator, metric_ref)
    assert_all_metrics_equal(model, test_frame, "custom_mm", 1)
def testCustomMetric(loanDatasetPath):
    """Train a weighted GBM on the loan dataset and assert its custom metric.

    Args:
        loanDatasetPath: location of the loan dataset, passed straight to
            ``h2o.import_file``.
    """
    response = "bad_loan"
    metric_name = "WeightedFalseNegativeLoss"

    frame = h2o.import_file(loanDatasetPath, destination_frame="loan_train")
    frame[response] = frame[response].asfactor()

    # Predictor list: every column except the response and "int_rate".
    # It is built before "weight" is added so the weight is not a predictor.
    predictors = frame.col_names
    for excluded in (response, "int_rate"):
        predictors.remove(excluded)

    frame["weight"] = frame["loan_amnt"]

    metric_ref = h2o.upload_custom_metric(
        WeightedFalseNegativeLossMetric,
        func_name="WeightedFalseNegativeLoss",
        func_file="weighted_false_negative_loss.py",
    )
    model = H2OGradientBoostingEstimator(
        model_id="gbm.hex",
        custom_metric_func=metric_ref,
    )
    model.train(y=response, x=predictors, training_frame=frame, weights_column="weight")

    performance = model.model_performance()
    assert performance.custom_metric_name() == metric_name
    assert performance.custom_metric_value() == 0.24579011595430142
def test_custom_metric_from_str():
    """Register ``CustomOneFuncStr`` as ``custom_mm`` and verify the model metrics."""
    upload_options = {"class_name": "CustomOneFunc", "func_name": "custom_mm"}
    uploaded_metric = h2o.upload_custom_metric(CustomOneFuncStr, **upload_options)

    # regression_model yields the trained model and the frame to check it on.
    trained_model, test_frame = regression_model(H2OGradientBoostingEstimator, uploaded_metric)
    assert_all_metrics_equal(trained_model, test_frame, "custom_mm", 1)
示例#6
0
import pandas as pd
import sys
sys.path.append("/home/jeremy/Documents/rinseOverRun/src")
from dataModeling.mape import MapeMetric  # noqa
import h2o  # noqa
from h2o.estimators import H2OGradientBoostingEstimator, H2ORandomForestEstimator  # noqa
import matplotlib.pyplot as plt  # noqa
import numpy as np  # noqa
from sklearn.preprocessing import MinMaxScaler  # noqa

# Connect to the H2O cluster and register the MAPE custom metric.
h2o.init(port=42222, nthreads=-1)
mape_func = h2o.upload_custom_metric(
    MapeMetric,
    func_name="MAPE",
    func_file="mape.py",
)

# Load the pre-processed training and validation sets.
train = pd.read_csv("data/processed/train.csv", index_col=0)
valid = pd.read_csv("data/processed/valid.csv", index_col=0)

# Scale the target column; the scaler is fitted on the training data only
# and then applied unchanged to the validation data.
scaler = MinMaxScaler()
target = 'final_rinse_total_turbidity_liter'
train[[target]] = scaler.fit_transform(train[[target]])
valid[[target]] = scaler.transform(valid[[target]])

# Convert both pandas frames to H2O frames.
hf = h2o.H2OFrame(train)
vf = h2o.H2OFrame(valid)

# GBM configured to early-stop on the uploaded custom metric.
gbm = H2OGradientBoostingEstimator(
    model_id="Ayaya_gbm",
    seed=1337,
    ntrees=300,
    min_split_improvement=1e-4,
    learn_rate=1e-3,
    stopping_metric="custom",
    stopping_rounds=10,
    stopping_tolerance=0.001,
    custom_metric_func=mape_func,
)
示例#7
0
def custom_logloss_mm():
    """Upload ``CustomLoglossFunc`` as the "logloss" custom metric and return the upload result."""
    uploaded = h2o.upload_custom_metric(
        CustomLoglossFunc,
        func_name="logloss",
        func_file="mm_logloss.py",
    )
    return uploaded
示例#8
0
def custom_rmse_mm():
    """Upload ``CustomRmseFunc`` as the "rmse" custom metric and return the upload result."""
    uploaded = h2o.upload_custom_metric(
        CustomRmseFunc,
        func_name="rmse",
        func_file="mm_rmse.py",
    )
    return uploaded
示例#9
0
def custom_mae_mm():
    """Upload ``CustomMaeFunc`` as the "mae" custom metric and return the upload result."""
    uploaded = h2o.upload_custom_metric(
        CustomMaeFunc,
        func_name="mae",
        func_file="mm_mae.py",
    )
    return uploaded
def custom_rmse_mm():
    """Register ``CustomRmseFunc`` with H2O under the name "rmse"."""
    upload_kwargs = {"func_name": "rmse", "func_file": "mm_rmse.py"}
    return h2o.upload_custom_metric(CustomRmseFunc, **upload_kwargs)
def custom_mae_mm():
    """Register ``CustomMaeFunc`` with H2O under the name "mae"."""
    upload_kwargs = {"func_name": "mae", "func_file": "mm_mae.py"}
    return h2o.upload_custom_metric(CustomMaeFunc, **upload_kwargs)
    def train_gradientboosting(self, train: h2o.H2OFrame,
                               x: List[str],
                               y: str,
                               weight: str,
                               cost_matrix_loss_metric: bool) -> H2OGenericEstimator:
        """ Use an H2O gradient boosting base model and a grid search to build a model

        Args:
            train (h2o dataframe): training data containing columns x, y, and weight
            x (list of str): column names of model features
            y (str): column name of ground truth
            weight (str): column name of row weights
            cost_matrix_loss_metric (bool): indicates if a custom loss function should be used in model selection

        Returns:
            H2OGenericEstimator: best model out of the training grid

        Raises:
            IndexError: if the custom metric is requested but no model in the grid reports a custom metric value

        """

        def sort_models(grid: H2OGridSearch) -> List[list]:
            """ Sorts models in the grid by their cross-validation custom_metric_value, i.e. the score reported
                by the custom metric set at model declaration.
            Args:
                grid (H2OGridSearch): a grid search object containing models with the custom metric
            Returns:
                List of [custom_metric_value, model_id] pairs sorted by increasing custom_metric_value, so the
                model with the lowest value comes first
            """
            functioning_list_of_models = []
            for model_name in grid.model_ids:
                try:
                    result = [h2o.get_model(model_name).model_performance(xval=True).custom_metric_value(),
                              model_name]
                    functioning_list_of_models.append(result)
                except AttributeError:
                    # Some models fail because they don't have a custom_metric_value, it's unclear why at this time.
                    # Report the model that failed (the original message printed the feature list instead).
                    print(f"Error with {model_name}")

            return sorted(functioning_list_of_models)

        def grid_train(base_model: H2OGradientBoostingEstimator, search_time: int) -> H2OGridSearch:
            """ Given a base model, train a search grid to find the optimum hyper parameters
            Args:
                base_model (H2OGradientBoostingEstimator): model that should be used in hyper parameter search
                search_time (int): max time in seconds that h2o should spend searching for a model in the grid
            Returns:
                H2OGridSearch : trained grid
            """
            gbm_hyper_parameters = {'learn_rate': [0.01, 0.1],
                                    'max_depth': [3, 5, 9],
                                    'sample_rate': [0.8, 1.0],
                                    'col_sample_rate': [0.2, 0.5, 1.0]}
            logging.info(f"Searching Hyper Parameter Space:\n {gbm_hyper_parameters}")
            grid = H2OGridSearch(base_model,
                                 gbm_hyper_parameters,
                                 search_criteria={'strategy': "RandomDiscrete", 'max_runtime_secs': search_time})
            grid.train(x=x, y=y, training_frame=train, weights_column=weight, grid_id="gbm_grid")
            return grid

        def get_cost_matrix_loss_metric_class() -> object:
            """ Rewrites the cost dictionary literal inside utils_model_metrics.py with this instance's
                inverse_costs and then imports CostMatrixLossMetric from that file.

                The original author flagged this approach as a stop-gap: editing a source file on disk
                to pass data is messy and should be replaced with a cleaner mechanism.

                NOTE(review): if utils_model_metrics was already imported in this process, the import below
                may return the cached module rather than the freshly written file -- confirm.
            Returns the class CostMatrixLossMetric with cost dictionary overwritten
            """
            file_path = os.path.join(self.dir_path, 'utils_model_metrics.py')
            with open(file_path, 'r') as file:
                file_data = file.read()
            # Every cost uses the same number pattern. Previously cost_tp/cost_fp accepted only a single
            # digit after the integer part, so values such as 0.25 never matched and the file silently
            # kept its old costs. Only plain decimal notation is recognised (no exponents).
            number = r"-?\d*\.?\d*"
            target = (r"\{'cost_tp': " + number
                      + r", 'cost_fp': " + number
                      + r", 'cost_tn': " + number
                      + r", 'cost_fn': " + number + r"\}")
            file_data = re.sub(target, str(self.inverse_costs), file_data)
            with open(file_path, 'w') as file:
                file.write(file_data)
                print("file written")

            from .utils_model_metrics import CostMatrixLossMetric
            return CostMatrixLossMetric

        if cost_matrix_loss_metric:
            # If cost_matrix_loss_metric upload it to cluster and include it in base model

            cost_matrix_loss_metric_func = h2o.upload_custom_metric(get_cost_matrix_loss_metric_class(),
                                                                    func_name="CostMatrixLossMetric",
                                                                    func_file="cost_matrix_loss_metric.py")
            base_model = H2OGradientBoostingEstimator(custom_metric_func=cost_matrix_loss_metric_func,
                                                      nfolds=3)
            gbm_grid = grid_train(base_model, self.search_time)
            # Custom metrics are not available in .get_grid so we must use our own function to select the
            # best model
            ranked_models = sort_models(gbm_grid)
            if not ranked_models:
                # Same exception type as the bare index lookup would raise, but with an actionable message.
                raise IndexError("No model in the grid reported a custom metric value; "
                                 "cannot select a best model")
            best_model = h2o.get_model(ranked_models[0][1])
        else:
            base_model = H2OGradientBoostingEstimator(nfolds=3)
            gbm_grid = grid_train(base_model, self.search_time)
            best_model = gbm_grid.get_grid(sort_by='auc', decreasing=True).models[0]

        return best_model