示例#1
0
def buildModelMetricsCheck(train_data, family):
    """Fit the same GAM twice and check that both fits report identical metrics.

    Two ``H2OGeneralizedAdditiveEstimator`` models are built with the same
    family, GAM column (``C1``) and seed on the same 90/10 split of
    ``train_data``.  The raw ``C1`` predictor must not show up among the
    coefficient names of the first model, and the value returned by ``mse()``
    (gaussian) or ``logloss()`` (any other family) must be exactly equal for
    the two models.

    :param train_data: frame with a ``response`` column and a ``C1`` column.
        For non-gaussian families the ``response`` column is converted to a
        factor in place.
    :param family: GAM family name; ``'gaussian'`` selects the MSE comparison.
    """
    response = "response"
    predictors = []  # only the GAM column is used as a predictor
    is_gaussian = family == 'gaussian'
    if not is_gaussian:
        train_data[response] = train_data[response].asfactor()
    splits = train_data.split_frame(ratios=[0.9], seed=12345)

    def _fit():
        # Build and train one model; both calls use identical settings.
        estimator = H2OGeneralizedAdditiveEstimator(family=family,
                                                    gam_columns=["C1"],
                                                    seed=12345)
        estimator.train(x=predictors,
                        y=response,
                        training_frame=splits[0],
                        validation_frame=splits[1])
        return estimator

    first_model = _fit()
    second_model = _fit()

    # the predictor column itself must not be reported as a coefficient
    coefficient_names = first_model.coef().keys()
    assert "C1" not in coefficient_names, "Not expecting C1 to be a coefficient but it is."

    # both models must report exactly the same metric
    if is_gaussian:
        expected, actual = first_model.mse(), second_model.mse()
        assert expected == actual, "Expected model MSE: {0}, Actual: {1}".format(
            expected, actual)
    else:
        expected, actual = first_model.logloss(), second_model.logloss()
        assert expected == actual, "Expected model logloss: {0}, Actual: {1}".format(
            expected, actual)
def test_gam_knots_key():
    """Compare a GAM trained with explicit knot frames against one without.

    Three knot vectors are uploaded as frames and passed through ``knot_ids``
    to the first gaussian GAM.  A second GAM with otherwise identical settings
    is trained without ``knot_ids``.  Every coefficient of the first model must
    match the coefficient of the same name in the second within 1e-6.
    """
    print("Checking coefficients and variable importance for multinomial")
    knot_locations = [
        [-49.98693927762423, -25.286098564527954, 0.44703511170863297, 25.50661829462607, 49.97312855846752],
        [-49.99386508664034, -25.275868426388616, 0.012500153211602433, 25.13371167580791, 49.98738587466542],
        [-49.99241697497996, -24.944012655490237, 0.1578389050436152, 25.296897954643736, 49.9876932143425],
    ]
    # one uploaded frame per GAM column, created in the same order as the columns
    knot_frames = [h2o.H2OFrame(python_obj=locations) for locations in knot_locations]

    training = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    for factor_col in ("C1", "C2"):
        training[factor_col] = training[factor_col].asfactor()
    response = "C21"
    predictors = ["C1", "C2"]
    knot_counts = [5, 5, 5]

    def _shared_params():
        # Settings common to both models; fresh list objects on every call.
        return dict(family='gaussian',
                    gam_columns=["C11", "C12", "C13"],
                    scale=[1, 1, 1],
                    num_knots=knot_counts,
                    bs=[2, 2, 0],
                    seed=12345)

    model_with_keys = H2OGeneralizedAdditiveEstimator(
        knot_ids=[frame.key for frame in knot_frames], **_shared_params())
    model_with_keys.train(x=predictors, y=response, training_frame=training)
    coefs_with_keys = model_with_keys.coef()

    model_without_keys = H2OGeneralizedAdditiveEstimator(**_shared_params())
    model_without_keys.train(x=predictors, y=response, training_frame=training)
    coefs_without_keys = model_without_keys.coef()

    for name in coefs_with_keys.keys():
        difference = abs(coefs_with_keys[name] - coefs_without_keys[name])
        assert difference < 1e-6, "expected coefficients: {0}.  actual coefficients: {1}".format(
            coefs_with_keys[name], coefs_without_keys[name])
    print("gam knot keys test completed successfully")
示例#3
0
def buildModelScaleParam(train_data, y, gamX, family):
    """Check that changing the GAM ``scale`` parameter changes the model metric.

    Two GAMs are trained on ``train_data`` with identical settings except for
    ``scale`` (0.001 per GAM column versus 10 per GAM column).  For the
    binomial family their ``logloss()`` values must differ; for any other
    family their ``mse()`` values must differ.

    :param train_data: training frame; predictors ``C1`` and ``C2`` are used.
    :param y: name of the response column.
    :param gamX: GAM columns handed to ``gam_columns``.
    :param family: GAM family name.
    """
    knot_counts = [5, 6, 7]
    predictors = ["C1", "C2"]

    small_scale_model = H2OGeneralizedAdditiveEstimator(
        family=family,
        gam_columns=gamX,
        scale=[0.001, 0.001, 0.001],
        bs=[0, 1, 2],
        num_knots=knot_counts)
    small_scale_model.train(x=predictors, y=y, training_frame=train_data)

    large_scale_model = H2OGeneralizedAdditiveEstimator(
        family=family,
        gam_columns=gamX,
        scale=[10, 10, 10],
        num_knots=knot_counts,
        bs=[0, 1, 2])
    large_scale_model.train(x=predictors, y=y, training_frame=train_data)

    if family == 'binomial':
        small_metric = small_scale_model.logloss()
        large_metric = large_scale_model.logloss()
        assert small_metric != large_metric, \
            "logloss from models with different scale parameters should be different but is not."
        return

    small_metric = small_scale_model.mse()
    large_metric = large_scale_model.mse()
    assert small_metric != large_metric, \
        "mse from models with different scale parameters should be different but is not."
def test_gam_knots_key():
    """Check the error messages raised for unusable GAM knot configurations.

    Two failure scenarios are exercised on the prostate data set with
    ``GLEASON`` as the GAM column:

    1. ``num_knots=[12]`` is requested and training is expected to fail with a
       message saying there are not enough values to generate knots.
    2. a user-supplied knot frame whose values are not in ascending order is
       passed through ``knot_ids`` and training is expected to fail with a
       message saying the knots are not sorted.

    The "training unexpectedly succeeded" failure is raised from the ``else``
    branch of each ``try`` so it is not caught by the ``except Exception``
    handler and misreported as a wrong-error-message failure.
    """
    # bad gam_column with not enough values is chosen to be gam_column
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    h2o_data[myY] = h2o_data[myY].asfactor()
    try:
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial',
                                                    gam_columns=["GLEASON"],
                                                    bs=[2],
                                                    num_knots=[12])
        h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert "does have not enough values to generate well-defined knots" in temp, "wrong error message received."
    else:
        raise AssertionError("Should have throw exception due to bad gam_column choice")

    # knots not chosen in ascending order and corresponding error message
    knots1 = [-0.98143075, -1.99905699, 0.02599159, 1.00770987, 1.99942290]
    frameKnots1 = h2o.H2OFrame(python_obj=knots1)

    try:
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial',
                                                    gam_columns=["GLEASON"],
                                                    knot_ids=[frameKnots1.key],
                                                    bs=[2])
        h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert "knots not sorted in ascending order for gam_column" in temp, "wrong error message received."
    else:
        raise AssertionError("Should have throw exception due to bad knot location choices")
def link_functions_tweedie_vpow():
    """Train gaussian GAMs with and without a weights column on random data.

    A 15-row frame is generated with a fixed NumPy seed.  Three gaussian GAMs
    sharing the same GAM column, ``scale`` and ``bs`` are trained on it: one
    weighted by ``weight_1`` (all ones), one weighted by ``weight_2`` (all
    threes) and one without weights.

    NOTE(review): no assertions follow the training calls in this view of the
    function.
    """
    np.random.seed(25)
    row_count = 15

    # the draw order (predictor, then target) fixes the generated values
    predictor_values = np.random.uniform(400, 800, row_count)
    target_values = np.random.uniform(0.7, 1.4, row_count)
    data = {
        "predictor": predictor_values,
        "target": target_values,
        "weight_1": [1] * row_count,
        "weight_2": [3] * row_count,
    }
    df = h2o.H2OFrame(pd.DataFrame(data))

    def _gam(**extra):
        # Gaussian GAM on "predictor"; extra carries the optional weights column.
        return H2OGeneralizedAdditiveEstimator(family='gaussian',
                                               gam_columns=["predictor"],
                                               scale=[1],
                                               bs=[2],
                                               **extra)

    model_w1 = _gam(weights_column='weight_1')
    model_w2 = _gam(weights_column='weight_2')
    model = _gam()

    for estimator in (model_w1, model_w2, model):
        estimator.train(x=["predictor"], y="target", training_frame=df)
示例#6
0
def test_gam_model_predict():
    """Check GAM early stopping for the binomial family.

    Steps:

    1. Train a binomial GAM without early stopping.
    2. For each stopping metric (``logloss``, ``AUC``) train a GAM with early
       stopping enabled, pull the matching validation metric column out of
       ``glm_scoring_history`` and verify it with
       ``pyunit_utils.evaluate_early_stopping``.
    3. Verify that combining early stopping with ``lambda_search=True`` raises
       an exception carrying the expected message.

    The "no exception was raised" failure in step 3 is raised from the ``else``
    branch so it is not swallowed by the ``except Exception`` handler.
    """
    print("Checking early-stopping for binomial")
    print("Preparing for data....")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    # C1..C10 are treated as categorical predictors
    for col_index in range(1, 11):
        col_name = "C{0}".format(col_index)
        h2o_data[col_name] = h2o_data[col_name].asfactor()
    myY = "C21"
    h2o_data[myY] = h2o_data[myY].asfactor()
    splits = h2o_data.split_frame(ratios=[0.8], seed=12345)
    train = splits[0]
    test = splits[1]
    predictors = list(range(0, 20))
    early_stop_metrics = ["logloss", "AUC"]
    early_stop_valid_metrics = ["validation_logloss", "validation_auc"]
    max_stopping_rounds = 3  # maximum stopping rounds allowed to be used for early stopping metric
    max_tolerance = 0.1  # maximum tolerance to be used for early stopping metric
    bigger_is_better = [False, True]
    print("Building a GAM model without early stop")
    h2o_model_no_early_stop = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["C11"],
                                                              scale=[0.0001], score_each_iteration=True)
    h2o_model_no_early_stop.train(x=predictors, y=myY, training_frame=train, validation_frame=test)

    for stop_metric, valid_metric, is_bigger_better in zip(early_stop_metrics, early_stop_valid_metrics,
                                                           bigger_is_better):
        print("Building early-stop model")
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["C11"], scale=[0.0001],
                                                    stopping_rounds=max_stopping_rounds,
                                                    score_each_iteration=True,
                                                    stopping_metric=stop_metric,
                                                    stopping_tolerance=max_tolerance)
        h2o_model.train(x=predictors, y=myY, training_frame=train, validation_frame=test)
        scoring_history = h2o_model._model_json["output"]["glm_scoring_history"]
        metric_list1 = pyunit_utils.extract_field_from_twoDimTable(scoring_history.col_header,
                                                                   scoring_history.cell_values,
                                                                   valid_metric)
        print("Checking if early stopping has been done correctly for {0}.".format(stop_metric))
        assert pyunit_utils.evaluate_early_stopping(metric_list1, max_stopping_rounds, max_tolerance,
                                                    is_bigger_better), \
            "Early-stopping was not done correctly."

    print("Check if lambda_search=True, early-stop enabled, an error should be thrown.")
    try:
        # the last stopping metric is used here, as the original loop variable did
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["C11"], scale=[0.0001],
                                                    stopping_rounds=max_stopping_rounds,
                                                    score_each_iteration=True,
                                                    stopping_metric=early_stop_metrics[-1],
                                                    stopping_tolerance=max_tolerance, lambda_search=True,
                                                    nlambdas=3)
        h2o_model.train(x=predictors, y=myY, training_frame=train, validation_frame=test)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("early stop:  cannot run when lambda_search=True.  Lambda_search has its own early-stopping "
                "mechanism" in temp), "Wrong exception was received."
        print("early-stop test passed!")
    else:
        raise AssertionError("Exception should have been risen when lambda_search=True and early stop is enabled")
def test_compare_R():
    """Compare H2O GAM results against hard-coded reference numbers from R.

    For the binomial data set the mis-classification rate on the 20% hold-out
    split must not exceed ``rAcc``.  For the gaussian data set the validation
    MSE must not exceed ``rMSE``.  Both models use the same GAM columns, basis
    types, scales, knot counts and ``lambda_search=True``.
    """
    myX = [
        'c_0', 'c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'c_8', 'c_9',
        'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10'
    ]
    myY = 'response'
    categorical_predictors = ["C3", "C7", "C8", "C10"]
    # settings shared by the binomial and the gaussian model
    gam_settings = dict(gam_columns=[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                        bs=[1, 1, 1],
                        scale=[0.001, 0.001, 0.001],
                        num_knots=[10, 10, 12],
                        lambda_search=True)

    print("Comparing H2O and R GAM performance for binomial")
    dataBinomial = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    for col in categorical_predictors + ["response"]:
        dataBinomial[col] = dataBinomial[col].asfactor()
    binomial_splits = dataBinomial.split_frame(ratios=[0.8], seed=1234)
    train_binomial = binomial_splits[0]
    test_binomial = binomial_splits[1]
    gamB = H2OGeneralizedAdditiveEstimator(family='binomial', **gam_settings)
    gamB.train(x=myX, y=myY, training_frame=train_binomial, validation_frame=test_binomial)
    gamPred = gamB.predict(test_binomial)
    matches = gamPred['predict'] == test_binomial['response']
    # one minus the mean of the match indicator
    gamBacc = 1 - matches.mean()[0, 0]
    rAcc = 0.01457801
    print("R accuracy: {0}, H2O accuracy: {1}.".format(rAcc, gamBacc))
    assert gamBacc <= rAcc, \
        "R mean error rate: {0}, H2O mean error rate: {1}. R performs better.".format(rAcc, gamBacc)

    print("Comparing H2O and R GAM performance for gaussian")
    dataGaussian = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/gam_test/synthetic_20Cols_gaussian_20KRows.csv"))
    for col in categorical_predictors:
        dataGaussian[col] = dataGaussian[col].asfactor()
    gaussian_splits = dataGaussian.split_frame(ratios=[0.8], seed=1234)
    train_gaussian = gaussian_splits[0]
    test_gaussian = gaussian_splits[1]
    gamG = H2OGeneralizedAdditiveEstimator(family='gaussian', **gam_settings)
    gamG.train(x=myX, y=myY, training_frame=train_gaussian, validation_frame=test_gaussian)
    gamMSE = gamG.model_performance(valid=True).mse()
    rMSE = 0.0006933308
    print("R MSE: {0}, H2O MSE: {1}.".format(rMSE, gamMSE))
    assert gamMSE <= rMSE, "R MSE: {0}, H2O MSE: {1}. R performs better.".format(rMSE, gamMSE)
def test_gridsearch():
    """Check that two spellings of ``gam_columns`` give the same grid models.

    Two Cartesian grid searches over binomial GAMs are run with hyper-parameter
    spaces that differ only in the second subspace: ``hyper_parameters`` wraps
    every GAM column in a list (``["c_0"]``) while ``hyper_parameters2`` gives
    single columns as bare strings (``"c_0"``).  Models at the same grid index
    must have equal coefficients within 1e-6.

    The predictor list is built explicitly; ``list.remove`` returns ``None``,
    so the previous ``names.remove(myY)`` always passed ``x=None`` to
    ``train``.
    """
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    h2o_data['response'] = h2o_data['response'].asfactor()
    h2o_data['C3'] = h2o_data['C3'].asfactor()
    h2o_data['C7'] = h2o_data['C7'].asfactor()
    h2o_data['C8'] = h2o_data['C8'].asfactor()
    h2o_data['C10'] = h2o_data['C10'].asfactor()
    myY = "response"
    # every column except the response is a predictor
    myX = [name for name in h2o_data.names if name != myY]
    search_criteria = {'strategy': 'Cartesian'}
    hyper_parameters = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    # same space, but single GAM columns in the second subspace are bare strings
    hyper_parameters2 = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True),
                              hyper_params=hyper_parameters,
                              search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True),
                               hyper_params=hyper_parameters2,
                               search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)
    # compare two models by checking their coefficients.  They should be the same
    for index in range(0, len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(),
                                           model2.coef(),
                                           tol=1e-6)
示例#9
0
def buildModelMetricsCheck(train_data, test_data, model_test_data, y, gamX,
                           family):
    """Train a GAM and call the metric accessors that apply to its family.

    The accessors are only invoked; their return values are not inspected, so
    this passes as long as none of them raises.

    :param train_data: training frame; ``C1`` and ``C2`` are the predictors.
    :param test_data: not used by this function.
    :param model_test_data: not used by this function.
    :param y: name of the response column.
    :param gamX: GAM columns handed to ``gam_columns``.
    :param family: GAM family name; selects which accessors are called.
    """
    knot_counts = [5, 6, 7]
    predictors = ["C1", "C2"]
    # NOTE: computed but not used below
    c1_levels = len(train_data["C1"].categories())
    c2_levels = len(train_data["C2"].categories())
    numCoeffs = c1_levels + c2_levels + sum(knot_counts) + 1 - len(knot_counts)

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family,
                                                gam_columns=gamX,
                                                scale=[1, 1, 1],
                                                num_knots=knot_counts,
                                                standardize=True,
                                                Lambda=[0],
                                                alpha=[0],
                                                max_iterations=3)
    h2o_model.train(x=predictors, y=y, training_frame=train_data)

    # accessor names per family, in call order; other families use the fallback
    accessors_by_family = {
        'binomial': ("auc", "aic", "logloss", "null_deviance", "residual_deviance"),
        'multinomial': ("null_deviance", "residual_deviance"),
    }
    fallback_accessors = ("mse", "null_deviance", "residual_deviance")
    for accessor_name in accessors_by_family.get(family, fallback_accessors):
        getattr(h2o_model, accessor_name)()
 def train_models(self):
     """
     Train the grid-search model and every manually built GAM model on the
     same predictors, response and training frame.
     """
     base_estimator = H2OGeneralizedAdditiveEstimator(
         family="gaussian",
         gam_columns=["C11", "C12", "C13"],
         keep_gam_cols=True)
     self.h2o_model = H2OGridSearch(base_estimator, self.hyper_parameters)
     self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
     # the manual models are trained with the same arguments as the grid
     for manual_model in self.manual_gam_models:
         manual_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
def link_functions_tweedie_vpow():
    """Check that tweedie GAM predictions do not depend on the weights column.

    A weighted tweedie GAM is trained on 10 random rows.  It then scores the
    training frame and a copy without the ``W`` weights column; the two
    prediction frames must agree within 1e-6.
    """
    np.random.seed(1234)
    n_rows = 10

    # the draw order (X1, X2, X3, W, Y) fixes the generated values
    x1 = np.random.randn(n_rows)
    x2 = np.random.randn(n_rows)
    x3 = np.random.randn(n_rows)
    weights = np.random.choice([10, 20], size=n_rows)
    response = np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows)
    data = {"X1": x1, "X2": x2, "X3": x3, "W": weights, "Y": response}

    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)

    gam_params = dict(family="tweedie",
                      gam_columns=["X3"],
                      weights_column="W",
                      lambda_=0,
                      tweedie_variance_power=1.5,
                      bs=[2],
                      tweedie_link_power=0)
    h2o_model = H2OGeneralizedAdditiveEstimator(**gam_params)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)

    predictions_with_weights = h2o_model.predict(train)
    # scoring without the weight column should produce the same frame
    predictions_without_weights = h2o_model.predict(test)
    pyunit_utils.compare_frames_local(predictions_with_weights,
                                      predictions_without_weights,
                                      prob=1,
                                      tol=1e-6)
 def setup_data(self):
     """
     Load the gaussian data set, set the predictor and response names, and
     build one manual GAM estimator for every combination of alpha and the
     scale / gam_columns / num_knots / bs values of each hyper-parameter
     subspace.  The estimators are appended to ``self.manual_gam_models``.
     """
     self.h2o_data = h2o.import_file(path=pyunit_utils.locate(
         "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
     for factor_col in ("C1", "C2"):
         self.h2o_data[factor_col] = self.h2o_data[factor_col].asfactor()
     self.myX = ["C1", "C2"]
     self.myY = "C21"

     # lazily enumerate the combinations in the original nested-loop order
     combinations = (
         (alpha, scale, gam_columns, num_knots, bs)
         for alpha in self.hyper_parameters["alpha"]
         for subspace in self.hyper_parameters["subspaces"]
         for scale in subspace['scale']
         for gam_columns in subspace['gam_columns']
         for num_knots in subspace['num_knots']
         for bs in subspace['bs']
     )
     for alpha, scale, gam_columns, num_knots, bs in combinations:
         estimator = H2OGeneralizedAdditiveEstimator(
             family="gaussian",
             gam_columns=gam_columns,
             keep_gam_cols=True,
             scale=scale,
             num_knots=num_knots,
             alpha=alpha,
             bs=bs)
         self.manual_gam_models.append(estimator)
示例#13
0
 def setup_data(self):
     """
     Load the gaussian data set, set the predictor and response names, and
     build one manual GAM estimator for every combination of the lambda,
     alpha, scale and num_knots hyper-parameter values.  The estimators are
     appended to ``self.manual_gam_models``.
     """
     self.h2o_data = h2o.import_file(path=pyunit_utils.locate(
         "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
     for factor_col in ("C1", "C2"):
         self.h2o_data[factor_col] = self.h2o_data[factor_col].asfactor()
     self.myX = ["C1", "C2"]
     self.myY = "C21"

     # lazily enumerate the combinations in the original nested-loop order
     combinations = (
         (lambda_param, alpha_param, scale_param, num_knots_param)
         for lambda_param in self.hyper_parameters['lambda']
         for alpha_param in self.hyper_parameters['alpha']
         for scale_param in self.hyper_parameters['scale']
         for num_knots_param in self.hyper_parameters['num_knots']
     )
     for lambda_param, alpha_param, scale_param, num_knots_param in combinations:
         estimator = H2OGeneralizedAdditiveEstimator(
             family="gaussian",
             gam_columns=["C11", "C12", "C13"],
             keep_gam_cols=True,
             scale=scale_param,
             bs=[2, 2, 0],
             num_knots=num_knots_param,
             alpha=alpha_param,
             lambda_=lambda_param)
         self.manual_gam_models.append(estimator)
def test_gam_transformed_frame_serialization():
    """Check that the GAM-transformed frame survives a model save/load cycle.

    A multinomial GAM is trained with ``keep_gam_cols=True``.  The frame
    referenced by ``gam_transformed_center_key`` is downloaded as CSV and the
    model is saved.  After ``h2o.remove_all()`` the model is loaded again and
    columns 2..14 of the reloaded transformed frame are compared with the same
    columns of the downloaded CSV (tolerance 1e-6).

    The temporary directory holding the CSV and the saved model is removed on
    exit, including when an assertion fails.
    """
    import shutil  # local import: only needed to clean up the temp directory

    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    ))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myX = ["C1", "C2"]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial",
                                                gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True,
                                                scale=[1, 1, 1],
                                                num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    gam_frame = h2o.get_frame(
        h2o_model._model_json["output"]["gam_transformed_center_key"])
    tmpdir = tempfile.mkdtemp()
    try:
        filename = os.path.join(tmpdir, "gamXFrame.csv")
        h2o.download_csv(gam_frame, filename)
        model_path = h2o.save_model(h2o_model, tmpdir)

        # drop everything so the frame has to come back with the loaded model
        h2o.remove_all()
        loaded_model = h2o.load_model(model_path)
        gam_frame_loaded = h2o.get_frame(
            loaded_model._model_json["output"]["gam_transformed_center_key"])
        gam_frame_original = h2o.import_file(filename)
        pyunit_utils.compare_frames_local(gam_frame_loaded[2:15],
                                          gam_frame_original[2:15],
                                          prob=1,
                                          tol=1e-6)
    finally:
        # best-effort cleanup; never mask the test outcome
        shutil.rmtree(tmpdir, ignore_errors=True)
    print("Test completed.")
def knots_error():
    """Check the error raised when a GAM column has fewer values than knots.

    A synthetic 1000-row frame is generated whose single predictor ``C1`` only
    holds integers drawn by ``randint(0, 9)``, and a binomial GAM asking for 10
    knots on that column is trained.  Training must fail and the exception
    text must contain the expected exception name and both explanatory
    messages.

    The "training unexpectedly succeeded" failure is raised from the ``else``
    branch so it is not swallowed by the ``except Exception`` handler.
    """
    # generate a small synthetic data set with a fixed seed
    np.random.seed(1234)
    data = h2o.H2OFrame(
        python_obj={
            'C1': list(np.random.randint(0, 9, size=1000)),
            'target': list(np.random.randint(0, 2, size=1000))
        })
    # single feature; the target becomes a factor for binomial classification
    feature_names = ['C1']
    data['target'] = data['target'].asfactor()
    # split into train and validation sets (only the training part is used)
    train, _ = data.split_frame([0.8], seed=1234)
    # build the GAM model
    h2o_model = H2OGeneralizedAdditiveEstimator(
        family='binomial',
        gam_columns=feature_names,
        scale=[1],
        num_knots=[10],
    )
    try:
        h2o_model.train(x=feature_names, y='target', training_frame=train)
    except Exception as ex:
        exception = str(ex)
        assert ("H2OModelBuilderIllegalArgumentException" in exception)
        assert ("has cardinality lower than the number of knots" in exception)
        assert (
            "chosen gam_column C1 does have not enough values to generate well-defined knots"
            in exception)
        print("Error correctly raised when cardinality < num_knots")
    else:
        raise AssertionError("Number of knots validation should have failed")
示例#16
0
def gam_train_metrics_recalculate(family):
    """Check that recomputed training metrics match the stored training metrics.

    A weighted GAM of the given family is trained on 1000 random rows.  The MSE
    obtained by scoring a freshly uploaded copy of the training data must be
    within 1e-6 of the MSE reported with ``train=True``.

    :param family: GAM family name passed to the estimator.
    """
    np.random.seed(1234)
    n_rows = 1000

    # the draw order (X1, X2, X3, W, Y) fixes the generated values
    x1 = np.random.randn(n_rows)
    x2 = np.random.randn(n_rows)
    x3 = np.random.randn(n_rows)
    weights = np.random.choice([10, 20], size=n_rows)
    response = np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows) + 0.1
    data = {"X1": x1, "X2": x2, "X3": x3, "W": weights, "Y": response}

    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)
    h2o_model = H2OGeneralizedAdditiveEstimator(family=family,
                                                gam_columns=["X3"],
                                                weights_column="W",
                                                lambda_=0,
                                                bs=[2],
                                                tweedie_variance_power=1.5,
                                                tweedie_link_power=0)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)

    # force H2O to recalculate metrics instead just taking them from metrics cache
    train_clone = h2o.H2OFrame(pd.DataFrame(data))

    def _mse(**performance_args):
        # MSE entry of the metrics returned by model_performance for these arguments.
        return h2o_model.model_performance(**performance_args)._metric_json["MSE"]

    print("GAM performance with test_data=train: {0}, with test_data=test: {1} and train=True: "
          "{2}".format(_mse(test_data=train), _mse(test_data=test), _mse(train=True)))

    recalculated_mse = _mse(test_data=train_clone)
    stored_training_mse = _mse(train=True)
    assert abs(recalculated_mse - stored_training_mse) < 1e-6
def buildModelCheckPredict(train_data, test_data, model_test_data, myy, gamX,
                           family, actual_family):
    """Train a GAM and compare its predictions with a reference frame.

    Predictions from the in-cluster model and from its MOJO counterpart are
    both compared with ``model_test_data``.  For gaussian models (explicit, or
    ``'AUTO'`` resolving to gaussian) all columns are compared; otherwise the
    ``predict`` column is dropped from all three frames first.

    :param train_data: training frame; ``C1`` and ``C2`` are the predictors.
    :param test_data: frame to score.
    :param model_test_data: reference predictions.  Its last column is dropped
        when it has more columns than the prediction frame, and its columns
        are renamed to the prediction column names.
    :param myy: name of the response column.
    :param gamX: GAM columns handed to ``gam_columns``.
    :param family: family passed to the estimator.
    :param actual_family: family expected when ``family`` is ``'AUTO'``.
    :return: the prediction frame (without ``predict`` for non-gaussian runs).
    """
    knot_counts = [5, 5, 5]
    predictors = ["C1", "C2"]

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family,
                                                gam_columns=gamX,
                                                scale=[1, 1, 1],
                                                num_knots=knot_counts,
                                                standardize=True,
                                                lambda_=[0],
                                                alpha=[0],
                                                max_iterations=3,
                                                compute_p_values=False,
                                                solver="irlsm")
    h2o_model.train(x=predictors, y=myy, training_frame=train_data)
    pred = h2o_model.predict(test_data)
    pred_mojo = as_mojo_model(h2o_model).predict(test_data)

    # align the reference frame with the prediction frame
    if pred.ncols < model_test_data.ncols:
        last_column = model_test_data.ncols - 1
        model_test_data = model_test_data.drop(last_column)
    model_test_data.set_names(pred.names)

    is_gaussian = family == 'gaussian' or (family == 'AUTO'
                                           and actual_family == 'gaussian')
    if not is_gaussian:
        # compare everything except the predicted label column
        pred = pred.drop('predict')
        pred_mojo = pred_mojo.drop('predict')
        model_test_data = model_test_data.drop('predict')

    for candidate in (pred, pred_mojo):
        pyunit_utils.compare_frames_local(candidate, model_test_data, prob=1)
    return pred
def buildModelCoeffVarimpCheck(train_data, y, gamX, family):
    """Check the number of coefficients and variable importances of a GAM.

    The expected per-class coefficient count is the number of levels of ``C1``
    plus the number of levels of ``C2`` plus ``sum(num_knots)`` plus one, minus
    the number of GAM columns.  For multinomial models the expected total is
    multiplied by the number of response classes.  The variable-importance
    list must have one entry fewer than the per-class coefficient count.

    :param train_data: training frame; ``C1`` and ``C2`` are the predictors.
    :param y: name of the response column.
    :param gamX: GAM columns handed to ``gam_columns``.
    :param family: GAM family name.
    """
    knot_counts = [5, 6, 7]
    predictors = ["C1", "C2"]
    c1_levels = len(train_data["C1"].categories())
    c2_levels = len(train_data["C2"].categories())
    coeffs_per_class = c1_levels + c2_levels + sum(knot_counts) + 1 - len(knot_counts)

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family,
                                                gam_columns=gamX,
                                                scale=[1, 1, 1],
                                                num_knots=knot_counts)
    h2o_model.train(x=predictors, y=y, training_frame=train_data)

    is_multinomial = family == 'multinomial'
    coefficients = h2o_model.coef()
    class_count = 1
    if is_multinomial:
        class_count = len(train_data[y].categories())
        coefficients = coefficients['coefficients']
    expected_total = coeffs_per_class * class_count

    assert len(coefficients) == expected_total, \
        "expected number of coefficients: {0}, actual number of coefficients: {1}".format(
            expected_total, len(coefficients))

    standardized = h2o_model.coef_norm()
    if is_multinomial:
        standardized = standardized['standardized_coefficients']
    assert len(standardized) == expected_total, \
        "expected number of coefficients: {0}, actual number of coefficients:{1}".format(
            expected_total, len(standardized))

    importances = h2o_model.varimp()
    # exclude the intercept term here
    expected_importances = coeffs_per_class - 1
    assert len(importances) == expected_importances, \
        "expected number of coefficients: {0}, actual number of coefficients:{1}".format(
            expected_importances, len(importances))
示例#19
0
 def train_models(self):
     """
     Train the grid-search model and every manually built GAM model on the
     same predictors, response and training frame, then print "done".
     """
     base_estimator = H2OGeneralizedAdditiveEstimator(family="gaussian",
                                                      keep_gam_cols=True)
     self.h2o_model = H2OGridSearch(base_estimator,
                                    hyper_params=self.hyper_parameters,
                                    search_criteria=self.search_criteria)
     self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
     # the manual models are trained with the same arguments as the grid
     for manual_model in self.manual_gam_models:
         manual_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
     print("done")
示例#20
0
    def setup_data(self):
        """
        Load the binomial data set, set the predictor and response names, and
        build one manual GAM estimator for every combination of lambda and the
        scale / gam_columns / num_knots / bs values of each hyper-parameter
        subspace.  The estimators are appended to ``self.manual_gam_models``
        and ``self.manual_model_count`` is incremented once per estimator.

        The predictor list is built explicitly: ``list.remove`` returns
        ``None``, so the previous ``names.remove(self.myY)`` always stored
        ``None`` in ``self.myX``.
        """
        self.h2o_data = \
            h2o.import_file(path = pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
        self.h2o_data['response'] = self.h2o_data['response'].asfactor()
        self.h2o_data['C3'] = self.h2o_data['C3'].asfactor()
        self.h2o_data['C7'] = self.h2o_data['C7'].asfactor()
        self.h2o_data['C8'] = self.h2o_data['C8'].asfactor()
        self.h2o_data['C10'] = self.h2o_data['C10'].asfactor()
        self.myY = "response"
        # every column except the response is a predictor
        self.myX = [name for name in self.h2o_data.names if name != self.myY]

        for lambda_ in self.hyper_parameters["lambda"]:
            for subspace in self.hyper_parameters["subspaces"]:
                for scale in subspace['scale']:
                    for gam_columns in subspace['gam_columns']:
                        for num_knots in subspace['num_knots']:
                            for bsVal in subspace['bs']:
                                self.manual_model_count += 1
                                self.manual_gam_models.append(
                                    H2OGeneralizedAdditiveEstimator(
                                        family="binomial",
                                        gam_columns=gam_columns,
                                        scale=scale,
                                        num_knots=num_knots,
                                        bs=bsVal,
                                        lambda_=lambda_,
                                        seed=1234))
 def setup_data(self):
     """
     Load the multinomial data set, set the predictor and response names, and
     build one manual GAM estimator for every combination of lambda and the
     scale / gam_columns / num_knots values of each hyper-parameter subspace.
     The estimators are appended to ``self.manual_gam_models``.
     """
     self.h2o_data = h2o.import_file(path=pyunit_utils.locate(
         "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
     ))
     for factor_col in ("C1", "C2"):
         self.h2o_data[factor_col] = self.h2o_data[factor_col].asfactor()
     self.myX = ["C1", "C2"]
     self.myY = "C11"
     self.h2o_data["C11"] = self.h2o_data["C11"].asfactor()

     # lazily enumerate the combinations in the original nested-loop order
     combinations = (
         (lambda_, scale, gam_columns, num_knots)
         for lambda_ in self.hyper_parameters["lambda"]
         for subspace in self.hyper_parameters["subspaces"]
         for scale in subspace['scale']
         for gam_columns in subspace['gam_columns']
         for num_knots in subspace['num_knots']
     )
     for lambda_, scale, gam_columns, num_knots in combinations:
         estimator = H2OGeneralizedAdditiveEstimator(
             family="multinomial",
             gam_columns=gam_columns,
             keep_gam_cols=True,
             scale=scale,
             num_knots=num_knots,
             lambda_=lambda_)
         self.manual_gam_models.append(estimator)
示例#22
0
def test_gam_gamColumns():
    """
    Check the centered gamified columns produced by a multinomial GAM.

    Trains a GAM on columns C6, C7 and C8 with keep_gam_cols=True, fetches
    the frame referenced by ``gam_transformed_center_key`` from the model
    output, drops the predictor/response columns and compares what is left
    against the stored reference CSV files (one per gam column, bound
    side by side in C6, C7, C8 order).
    """
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    ))
    myX = ["C1", "C2"]
    myY = "C11"
    for factor_col in ("C1", "C2", "C11"):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()

    h2o_model = H2OGeneralizedAdditiveEstimator(
        family="multinomial",
        gam_columns=["C6", "C7", "C8"],
        keep_gam_cols=True,
        scale=[1, 1, 1],
        num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)

    # Frame holding the transformed (centered) gam columns kept by the model.
    center_key = h2o_model._model_json["output"]["gam_transformed_center_key"]
    gamFrame = h2o.get_frame(center_key)
    for unwanted in ("C1", "C2", "C11"):
        gamFrame = gamFrame.drop(unwanted)

    # Reference answers, one file per gam column, column-bound in order.
    answer_files = (
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C6Gam_center.csv",
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C7Gam_center.csv",
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C8Gam_center.csv",
    )
    gamFrameAns = None
    for answer_file in answer_files:
        piece = h2o.import_file(pyunit_utils.locate(answer_file))
        gamFrameAns = piece if gamFrameAns is None else gamFrameAns.cbind(piece)

    pyunit_utils.compare_frames_local(gamFrameAns, gamFrame)
    print("gam gamcolumn test completed successfully")
    def setup_data(self):
        """
        Load the gaussian training frame and pre-build the manual GAM models.

        Imports the 20-column gaussian data set, converts (at most) the first
        ten columns to factors, and records the response name in ``self.myY``
        and the predictor names in ``self.myX``.  Then, for every combination
        of lambda, scale, gam_columns, num_knots and bs found in
        ``self.hyper_parameters``, increments ``self.manual_model_count`` and
        appends a gaussian H2OGeneralizedAdditiveEstimator to
        ``self.manual_gam_models``.
        """
        self.h2o_data = h2o.import_file(path=pyunit_utils.locate(
            "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
        names = self.h2o_data.names
        # Only the first ten columns are categorical.
        for name in names[:10]:
            self.h2o_data[name] = self.h2o_data[name].asfactor()
        self.myY = "C21"
        # list.remove() mutates in place and returns None, so assigning its
        # result left self.myX as None.  Build the predictor list explicitly:
        # every column except the response.
        self.myX = [name for name in names if name != self.myY]

        for lambda_ in self.hyper_parameters["lambda"]:
            for subspace in self.hyper_parameters["subspaces"]:
                for scale in subspace['scale']:
                    for gam_columns in subspace['gam_columns']:
                        for num_knots in subspace['num_knots']:
                            for bsVal in subspace['bs']:
                                self.manual_model_count += 1
                                self.manual_gam_models.append(
                                    H2OGeneralizedAdditiveEstimator(
                                        family="gaussian",
                                        gam_columns=gam_columns,
                                        scale=scale,
                                        num_knots=num_knots,
                                        bs=bsVal,
                                        lambda_=lambda_))
def test_gam_knots_key():
    """
    Compare a GAM trained with explicit knot frames to one without.

    Uploads three knot vectors as H2OFrames and passes their keys through
    ``knot_ids`` to a multinomial GAM on columns C6, C7 and C8.  A second
    model is trained with the same settings but no ``knot_ids``.  Every
    entry under ``"coefficients"`` of the first model's coef() must agree
    with the second model's to within 1e-6.
    """
    print("Checking coefficients and variable importance for multinomial")
    knot_values = (
        [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290],
        [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589],
        [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676],
    )
    # One single-column frame per gam column, created in C6, C7, C8 order.
    knot_frames = [h2o.H2OFrame(python_obj=values) for values in knot_values]

    h2o_data = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
        ))
    x = ["C1", "C2"]
    y = "C11"
    for factor_col in ("C1", "C2", "C11"):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()

    numKnots = [5, 5, 5]
    gam_columns = ["C6", "C7", "C8"]

    # Model 1: knots supplied explicitly through frame keys.
    model_with_keys = H2OGeneralizedAdditiveEstimator(
        family='multinomial',
        gam_columns=gam_columns,
        scale=[1, 1, 1],
        num_knots=numKnots,
        knot_ids=[frame.key for frame in knot_frames])
    model_with_keys.train(x=x, y=y, training_frame=h2o_data)
    coeffs_with_keys = model_with_keys.coef()["coefficients"]

    # Model 2: identical settings, knots left for the algorithm to choose.
    model_auto_knots = H2OGeneralizedAdditiveEstimator(
        family='multinomial',
        gam_columns=["C6", "C7", "C8"],
        scale=[1, 1, 1],
        num_knots=numKnots)
    model_auto_knots.train(x=x, y=y, training_frame=h2o_data)
    coeffs_auto_knots = model_auto_knots.coef()["coefficients"]

    for coef_name in coeffs_with_keys:
        expected = coeffs_with_keys[coef_name]
        actual = coeffs_auto_knots[coef_name]
        assert abs(expected - actual) < 1e-6, \
            "expected coefficients: {0}.  actual coefficients: {1}".format(
                expected, actual)
    print("gam knot keys test completed successfully")
def import_gam_mojo_regression(family):
    """
    Round-trip a weighted GAM regression model through MOJO export/import.

    Builds a small random frame (seeded, 10 rows) with predictors X1-X3, a
    weights column W and a response Y, trains a GAM of the given ``family``
    using W as the weights column, and checks that:

    * predictions are the same with and without the weights column present;
    * the imported MOJO model yields the same predictions as the in-H2O model;
    * the MOJO model's MSE matches the in-H2O model's MSE (within 1e-6) on
      freshly cloned copies of both the train and the test frame.

    :param family: family name passed straight to
        H2OGeneralizedAdditiveEstimator (the tweedie power settings are always
        supplied alongside it).
    """
    # Local import: only needed to remove the temporary MOJO directory.
    import shutil

    np.random.seed(1234)
    n_rows = 10

    data = {
        "X1": np.random.randn(n_rows),
        "X2": np.random.randn(n_rows),
        "X3": np.random.randn(n_rows),
        "W": np.random.choice([10, 20], size=n_rows),
        "Y": np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows) + 0.1
    }

    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)
    h2o_model = H2OGeneralizedAdditiveEstimator(family=family,
                                                gam_columns=["X3"],
                                                weights_column="W",
                                                lambda_=0,
                                                bs=[2],
                                                tweedie_variance_power=1.5,
                                                tweedie_link_power=0)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)
    print(h2o_model)

    predict_w = h2o_model.predict(train)
    # scoring without weight column
    predict = h2o_model.predict(test)

    # get train perf on a cloned frame (to avoid re-using cached metrics - force to recalculate)
    train_clone = h2o.H2OFrame(train.as_data_frame(use_pandas=True))
    model_perf_on_train = h2o_model.model_performance(test_data=train_clone)

    # ditto on test
    test_clone = h2o.H2OFrame(test.as_data_frame(use_pandas=True))
    model_perf_on_test = h2o_model.model_performance(test_data=test_clone)

    # should produce same frame
    pyunit_utils.compare_frames_local(predict_w, predict, prob=1, tol=1e-6)

    # Save the MOJO to a temporary directory; the directory is removed in the
    # finally clause so repeated runs do not leave temp directories behind.
    mojo_dir = tempfile.mkdtemp()
    try:
        original_model_filename = h2o_model.save_mojo(mojo_dir)

        # Load the model from the temporary file
        mojo_model = h2o.import_mojo(original_model_filename)

        predict_mojo_w = mojo_model.predict(train)
        predict_mojo = mojo_model.predict(test)

        # Both should produce same results as in-H2O models
        pyunit_utils.compare_frames_local(predict_mojo_w, predict, prob=1, tol=1e-6)
        pyunit_utils.compare_frames_local(predict_mojo, predict, prob=1, tol=1e-6)

        mojo_perf_on_train = mojo_model.model_performance(test_data=train_clone)
        assert abs(mojo_perf_on_train._metric_json["MSE"] - model_perf_on_train._metric_json["MSE"]) < 1e-6

        mojo_perf_on_test = mojo_model.model_performance(test_data=test_clone)
        assert abs(mojo_perf_on_test._metric_json["MSE"] - model_perf_on_test._metric_json["MSE"]) < 1e-6
    finally:
        # Best-effort cleanup; never mask a test failure with a cleanup error.
        shutil.rmtree(mojo_dir, ignore_errors=True)
def buildModelCheckPredict(train_data, myy, gamX, family, searchLambda=False, stdardize=True):
    """
    Train a GAM on ``train_data`` and return the fitted estimator.

    :param train_data: training frame.
    :param myy: name of the response column.
    :param gamX: gam column names (three are expected, matching the fixed
        three-element scale and num_knots lists used here).
    :param family: family name passed to the estimator.
    :param searchLambda: forwarded as ``lambda_search``.
    :param stdardize: forwarded as ``standardize``.
    :return: the trained H2OGeneralizedAdditiveEstimator.
    """
    estimator_settings = dict(
        family=family,
        gam_columns=gamX,
        scale=[0.1, 0.1, 0.1],
        num_knots=[5, 5, 5],
        lambda_search=searchLambda,
        standardize=stdardize,
    )
    fitted = H2OGeneralizedAdditiveEstimator(**estimator_settings)
    # Predictors are fixed to the two categorical columns C1 and C2.
    fitted.train(x=["C1", "C2"], y=myy, training_frame=train_data)
    return fitted
def test_gam_beta_constraints():
    """
    Check that GAM training with beta constraints is reproducible.

    Trains two gaussian GAMs with identical settings, seed and beta
    constraints (bounds on C1 and C13), and asserts their coefficients agree
    to within 1e-6.  Finally asserts that every coefficient of the first
    model whose name contains ``"_is_"`` is non-negative.
    """
    h2o_data = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    for factor_col in ("C1", "C2"):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()

    # Rows are [name, lower bound, upper bound].
    bc = [["C1", 0.0, 0.5], ["C13", 0.0, 0.7]]
    beta_constraints = h2o.H2OFrame(bc)
    beta_constraints.set_names(["names", "lower_bounds", "upper_bounds"])

    y = "C21"
    x = ["C1", "C2", "C13"]
    numKnots = [5, 5, 5]

    def _train_constrained_gam():
        # Build and train one model; both runs use exactly these settings.
        model = H2OGeneralizedAdditiveEstimator(
            family='gaussian',
            gam_columns=["C11", "C12", "C13"],
            scale=[1, 1, 1],
            num_knots=numKnots,
            bs=[2, 2, 0],
            beta_constraints=beta_constraints,
            seed=12)
        model.train(x=x, y=y, training_frame=h2o_data)
        return model

    h2o_model = _train_constrained_gam()
    h2oCoeffs = h2o_model.coef()
    h2o_model2 = _train_constrained_gam()
    h2oCoeffs2 = h2o_model2.coef()

    for coef_name in h2oCoeffs:
        first_value = h2oCoeffs[coef_name]
        second_value = h2oCoeffs2[coef_name]
        assert abs(first_value - second_value) < 1e-6, \
            "expected coefficients: {0}.  actual coefficients: {1}".format(first_value, second_value)

    # check to make sure gam column coefficients are non-negative
    # (identified here by "_is_" in the coefficient name)
    coef_dict = h2o_model.coef()
    for key, value in coef_dict.items():
        if "_is_" not in key:
            continue
        assert value >= 0
示例#28
0
def test_gam_model_predict():
    """
    Check that a validation frame does not change cross-validated GAM coefficients.

    Trains two identically configured multinomial GAMs (2-fold modulo cross
    validation, lambda search) on the covtype training split: one without and
    one with a validation frame.  The ``'coefficients'`` entries of both
    models' coef() are then compared with pyunit_utils.assertEqualCoeffDicts.
    """
    covtype_df = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/covtype/covtype.full.csv"))
    train, valid = covtype_df.split_frame([0.9], seed=1234)

    # Prepare predictors and response columns;
    # last column is Cover_Type, our desired response variable
    all_columns = covtype_df.col_names
    covtype_X, covtype_y = all_columns[:-1], all_columns[-1]

    def _new_gam():
        # Fresh estimator (and fresh argument lists) for each model.
        return H2OGeneralizedAdditiveEstimator(
            family='multinomial',
            solver='IRLSM',
            gam_columns=["Slope"],
            scale=[0.0001],
            num_knots=[5],
            standardize=True,
            nfolds=2,
            fold_assignment='modulo',
            alpha=[0.9, 0.5, 0.1],
            lambda_search=True,
            nlambdas=5,
            max_iterations=3)

    # build model with cross validation and no validation dataset
    gam_multi = _new_gam()
    gam_multi.train(covtype_X, covtype_y, training_frame=train)

    # build model with cross validation and with validation dataset
    gam_multi_valid = _new_gam()
    gam_multi_valid.train(covtype_X,
                          covtype_y,
                          training_frame=train,
                          validation_frame=valid)

    # model should yield the same coefficients in both case
    coef_without_valid = gam_multi.coef()
    coef_with_valid = gam_multi_valid.coef()
    pyunit_utils.assertEqualCoeffDicts(coef_without_valid['coefficients'],
                                       coef_with_valid['coefficients'])
示例#29
0
def test_gam_effective_parameters():
    """
    Check input vs. actual values of AUTO parameters on a binomial GAM.

    With default settings the trained model must report solver input 'AUTO'
    and actual "IRLSM", and fold_assignment input 'AUTO' and actual None.
    After setting the ``sys.ai.h2o.algos.evaluate_auto_model_parameters``
    property to "false", a newly trained model must report 'AUTO' for both
    input and actual values.  The property is set back to "true" in a finally
    clause.
    """
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for factor_col in ("C1", "C2", "C21"):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()

    def _train_gam():
        # Build and train one binomial GAM; both phases use these settings.
        model = H2OGeneralizedAdditiveEstimator(
            family='binomial',
            gam_columns=["C11", "C12", "C13"],
            scale=[1, 1, 1],
            num_knots=[5, 6, 7],
            standardize=True,
            Lambda=[0],
            alpha=[0],
            max_iterations=3)
        model.train(x=["C1", "C2"], y="C21", training_frame=h2o_data)
        return model

    gam = _train_gam()
    solver_parm = gam.parms['solver']
    fold_parm = gam.parms['fold_assignment']
    assert solver_parm['input_value'] == 'AUTO'
    assert solver_parm['actual_value'] == "IRLSM"
    assert fold_parm['input_value'] == 'AUTO'
    assert fold_parm['actual_value'] is None

    try:
        # Turn off evaluation of AUTO parameters for the second model.
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(
            "sys.ai.h2o.algos.evaluate_auto_model_parameters", "false"))
        gam = _train_gam()
        solver_parm = gam.parms['solver']
        fold_parm = gam.parms['fold_assignment']
        assert solver_parm['input_value'] == 'AUTO'
        assert solver_parm['actual_value'] == 'AUTO'
        assert fold_parm['input_value'] == 'AUTO'
        assert fold_parm['actual_value'] == 'AUTO'
    finally:
        # Always set the property back to "true", even when an assert fails.
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(
            "sys.ai.h2o.algos.evaluate_auto_model_parameters", "true"))
def test_gam_model_predict():
    print("Checking model scoring for gaussian")
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myY = "C21"
    model_test_data = h2o.import_file(
        pyunit_utils.locate("smalldata/gam_test/predictGaussianGAM1.csv"))
    buildModelCheckPredict(h2o_data, h2o_data, model_test_data, myY,
                           ["C11", "C12", "C13"], 'gaussian')

    print("Checking model scoring for multinomial")
    h2o_data = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
        ))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    model_test_data = h2o.import_file(
        pyunit_utils.locate("smalldata/gam_test/predictMultinomialGAM1.csv"))
    buildModelCheckPredict(h2o_data, h2o_data, model_test_data, myY,
                           ["C6", "C7", "C8"], 'multinomial')

    print("Checking model scoring for binomial")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    model_test_data = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/gam_test/predictBinomialGAMRPython.csv"))
    buildModelCheckPredict(h2o_data, h2o_data, model_test_data, myY,
                           ["C11", "C12", "C13"], 'binomial')
    print("gam coeff/varimp test completed successfully")

    # add fractional binomial just to make sure it runs
    print("Checking model scoring for fractionalbinomial")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    h2o_model = H2OGeneralizedAdditiveEstimator(
        family="fractionalbinomial",
        gam_columns=["C11", "C12", "C13"],
        scale=[1, 1, 1],
        num_knots=[5, 5, 5],
        standardize=True,
        solver="irlsm")
    h2o_model.train(x=["C1", "C2"], y="C21", training_frame=h2o_data)
    predictTest = h2o_model.predict(h2o_data)