def test_infogram_iris_x_attributes():
    """
    Check that a trained infogram model can be handed to GLM as ``x``.

    Two multinomial GLMs are trained on the iris frame: the first gets the
    predictor list returned by ``_extract_x_from_model()``, the second gets
    the infogram model object itself.  Every entry of the two coefficient
    dicts must agree to within 1e-6.
    """
    response = "Species"
    iris = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv"))
    iris[response] = iris[response].asfactor()
    predictors = iris.names
    predictors.remove(response)

    # infogram model built with default settings apart from seed/distribution
    infogram = H2OInfogram(seed=12345, distribution='multinomial')
    infogram.train(x=predictors, y=response, training_frame=iris)

    def fit_glm_coefs(x_spec):
        # Train one multinomial GLM with the given ``x`` and return its coefs.
        glm_model = H2OGeneralizedLinearEstimator(family='multinomial')
        glm_model.train(x=x_spec, y=response, training_frame=iris)
        return glm_model.coef()

    coefs_from_list = fit_glm_coefs(infogram._extract_x_from_model())
    coefs_from_model = fit_glm_coefs(infogram)

    for class_key, expected in coefs_from_list.items():
        pyunit_utils.assertCoefDictEqual(expected,
                                         coefs_from_model[class_key],
                                         tol=1e-6)
def test_infogram_personal_loan():
    """
    Check that predictors can be specified through an infogram model.

    An infogram model is trained on the personal-loan data with ``Age`` and
    ``ZIP Code`` as protected columns.  A GLM trained with the extracted
    predictor list must produce the same coefficients (tolerance 1e-6) as a
    GLM trained with the infogram model passed directly as ``x``.
    """
    response = "Personal Loan"
    predictors = [
        "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
        "Securities Account", "CD Account", "Online", "CreditCard"
    ]
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    loans[response] = loans[response].asfactor()

    infogram = H2OInfogram(seed=12345,
                           protected_columns=["Age", "ZIP Code"])
    infogram.train(x=predictors, y=response, training_frame=loans)

    def fit_glm_coefs(x_spec):
        # Train one default GLM with the given ``x`` and return its coefs.
        glm_model = H2OGeneralizedLinearEstimator()
        glm_model.train(x=x_spec, y=response, training_frame=loans)
        return glm_model.coef()

    coefs_from_list = fit_glm_coefs(infogram._extract_x_from_model())
    coefs_from_model = fit_glm_coefs(infogram)

    pyunit_utils.assertCoefDictEqual(coefs_from_list, coefs_from_model,
                                     tol=1e-6)
def test_maxrglm_gaussian_coefs():
    """
    Verify the coefficient accessors of a maxrglm model agree with each other.

    For every index of the list returned by ``coef()``, the coefficients are
    obtained three ways -- from the full ``coef()``/``coef_norm()`` lists, from
    ``coef(subset_size)``/``coef_norm(subset_size)``, and from the model
    fetched through its id in ``best_model_ids`` -- and compared to 1e-6.
    """
    tolerance = 1e-6
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    selector = maxrglm(seed=12345, max_predictor_number=7)
    selector.train(training_frame=prostate, x=predictors, y=response)
    all_coefs = selector.coef()
    all_coefs_norm = selector.coef_norm()

    for idx in range(len(all_coefs)):
        # reference coefficients: load the stored model by id and query it
        model_name = selector._model_json["output"]["best_model_ids"][idx][
            'name']
        reference = h2o.get_model(model_name)
        ref_coef = reference.coef()
        ref_coef_norm = reference.coef_norm()

        # subset sizes are 1-based while list positions are 0-based
        size = idx + 1
        comparisons = (
            (all_coefs[idx], ref_coef),
            (all_coefs_norm[idx], ref_coef_norm),
            (selector.coef(size), ref_coef),
            (selector.coef_norm(size), ref_coef_norm),
        )
        for actual, expected in comparisons:
            pyunit_utils.assertCoefDictEqual(actual, expected, tolerance)
def test_GLM_RCC_warning():
    """
    Check the warning emitted for remove_collinear_columns with a non-IRLSM solver.

    A tweedie GLM is trained with ``remove_collinear_columns=True`` and
    ``solver="coordinate_descent"`` while ``sys.stderr`` is redirected to an
    in-memory buffer.  The test then asserts that:

    * the coefficients equal those of the same model built without
      ``remove_collinear_columns`` (the flag is expected to have no effect);
    * the captured stderr text contains the expected warning phrase.
    """
    warnNumber = 1
    hdf = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with canonical link: TWEEDIE")
    buffer = StringIO()  # receives everything written to stderr below
    # Remember whatever stream is currently installed (a test runner may have
    # replaced sys.stderr already) so it can be put back exactly as it was.
    saved_stderr = sys.stderr
    sys.stderr = buffer
    try:
        model_h2o_tweedie = H2OGeneralizedLinearEstimator(
            family="tweedie",
            link="tweedie",
            alpha=0.5,
            Lambda=0.1,
            remove_collinear_columns=True,
            solver="coordinate_descent")
        model_h2o_tweedie.train(
            x=x, y=y,
            training_frame=hdf)  # this should generate a warning message
        model_h2o_tweedie_wo_rcc = H2OGeneralizedLinearEstimator(
            family="tweedie",
            link="tweedie",
            alpha=0.5,
            Lambda=0.1,
            solver="coordinate_descent")
    finally:
        # always undo the redirection, even when model building fails
        sys.stderr = saved_stderr
    model_h2o_tweedie_wo_rcc.train(
        x=x, y=y, training_frame=hdf)  # no warning message here.

    # since remove_collinear_columns has no effect, the two models should be the same
    pyunit_utils.assertCoefDictEqual(model_h2o_tweedie.coef(),
                                     model_h2o_tweedie_wo_rcc.coef())

    # check and make sure we get the correct warning message
    warn_phrase = "remove_collinear_columns only works when IRLSM"
    try:  # for python 2.7, where the StringIO object exposes ``buflist``
        assert len(buffer.buflist) == warnNumber
        print(buffer.buflist[0])
        assert warn_phrase in buffer.buflist[0]
    except (AttributeError, AssertionError):
        # for python 3 (io.StringIO has no ``buflist``), or when the
        # buflist-based check did not hold: fall back to the full captured
        # text.  Only these two exception types are expected here; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        warns = buffer.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert warn_phrase in warns
Пример #5
0
    def match_models(self):
        """
        Pair every manually built GAM model with a grid-search model.

        For each model in ``self.manual_gam_models`` the grid models in
        ``self.h2o_model.models`` are scanned for the first one whose
        ``alpha``, ``lambda``, ``scale`` and ``num_knots`` actual parameters
        are all equal.  On a match ``self.num_grid_models`` is incremented and
        the coefficients of the two models are asserted equal.  Finally the
        number of matches must equal ``self.num_expected_models``.
        """
        compared_params = ('alpha', 'lambda', 'scale', 'num_knots')
        for manual_model in self.manual_gam_models:
            wanted = dict(
                (name, manual_model.actual_params[name])
                for name in compared_params)
            for candidate in self.h2o_model.models:
                # all() stops at the first differing parameter, in the order
                # listed in compared_params
                same_params = all(
                    candidate.actual_params[name] == wanted[name]
                    for name in compared_params)
                if not same_params:
                    continue
                self.num_grid_models += 1
                pyunit_utils.assertCoefDictEqual(candidate.coef(),
                                                 manual_model.coef())
                break  # only the first matching grid model is counted

        failure_message = ("Grid search model parameters incorrect or "
                           "incorrect number of models generated")
        assert self.num_grid_models == self.num_expected_models, failure_message
def test_glm_beta_constraints_dict_megan():
    """
    Check that GLM beta constraints given as a dict match those given as a frame.

    One GLM is trained with ``beta_constraints`` supplied as an H2OFrame with
    columns ``names``/``lower_bounds``/``upper_bounds``; a second GLM gets the
    same bounds as a nested dict.  The ``LIMIT_BAL`` coefficient of the first
    model must respect the lower bound (or be zero), and both models must
    yield the same coefficients to within 1e-6.
    """
    response = "DEFAULT_PAYMENT_NEXT_MONTH"
    lower = 0.0001
    upper = 1e6
    credit = h2o.import_file(
        pyunit_utils.locate("smalldata/kaggle/CreditCard/creditcard_train_cat.csv"),
        col_types={response: "enum"})

    # constraints expressed as a frame
    frame_constraints = h2o.H2OFrame({'names': ["LIMIT_BAL", "AGE"],
                                      'lower_bounds': [lower, lower],
                                      'upper_bounds': [upper, upper]})
    # make sure we have the column names in expected order, the backend does
    # weird things when the order is different
    frame_constraints = frame_constraints[["names", "lower_bounds", "upper_bounds"]]
    glm_with_frame = H2OGeneralizedLinearEstimator(model_id="beta_glm",
                                                   beta_constraints=frame_constraints,
                                                   seed=42)
    glm_with_frame.train(y=response, training_frame=credit)
    coefs_with_frame = glm_with_frame.coef()
    assert coefs_with_frame["LIMIT_BAL"] >= lower or coefs_with_frame["LIMIT_BAL"] == 0

    # the same constraints expressed as a dict keyed by predictor name
    dict_constraints = {
        "LIMIT_BAL": {"lower_bound": lower, "upper_bound": upper},
        "AGE": {"lower_bound": lower, "upper_bound": upper},
    }
    glm_with_dict = H2OGeneralizedLinearEstimator(model_id="beta_glm",
                                                  beta_constraints=dict_constraints,
                                                  seed=42)
    glm_with_dict.train(y=response, training_frame=credit)
    coefs_with_dict = glm_with_dict.coef()

    # coefficients should be the same from both runs
    pyunit_utils.assertCoefDictEqual(coefs_with_frame, coefs_with_dict, tol=1e-6)
    print("test complete!")
Пример #7
0
def test_gam_dual_mode_multinomial():
    """
    Check that both ways of writing ``gam_columns`` give the same GAM model.

    The first model mixes plain column names with a nested list
    (``["C6", ["C7", "C8"], "C9", "C10"]``); the second wraps every entry in
    its own list.  Both are trained on the same data with the same file used
    for validation, and must produce equal coefficients and equal validation
    logloss (tolerance 1e-6).
    """
    data_file = "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"

    def load_frame():
        # Import the csv and turn the response and the two predictors into enums.
        frame = h2o.import_file(pyunit_utils.locate(data_file))
        for column in ("C11", "C1", "C2"):
            frame[column] = frame[column].asfactor()
        return frame

    train = load_frame()
    test = load_frame()
    predictors = ["C1", "C2"]
    response = "C11"
    mixed_gam_cols = ["C6", ["C7", "C8"], "C9", "C10"]
    nested_gam_cols = [["C6"], ["C7", "C8"], ["C9"], ["C10"]]

    model_mixed = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                  gam_columns=mixed_gam_cols,
                                                  bs=[1, 1, 0, 0],
                                                  max_iterations=2)
    model_mixed.train(x=predictors, y=response, training_frame=train,
                      validation_frame=test)
    model_nested = H2OGeneralizedAdditiveEstimator(family='multinomial',
                                                   gam_columns=nested_gam_cols,
                                                   bs=[1, 1, 0, 0],
                                                   max_iterations=2)
    model_nested.train(x=predictors, y=response, training_frame=train,
                       validation_frame=test)

    # both models must produce the same coefficients
    coefs_mixed = model_mixed.coef()
    coefs_nested = model_nested.coef()
    print(coefs_mixed)
    print(coefs_nested)
    pyunit_utils.assertCoefDictEqual(coefs_mixed['coefficients'],
                                     coefs_nested['coefficients'],
                                     tol=1e-6)

    # both models must produce the same validation metrics
    logloss_mixed = model_mixed.logloss(valid=True)
    logloss_nested = model_nested.logloss(valid=True)
    assert abs(logloss_mixed - logloss_nested) < 1e-6, \
        "Expected validation logloss: {0}, Actual validation logloss: {1}".format(
            logloss_mixed, logloss_nested)
Пример #8
0
def test_gaussian_alpha():
    """
    Check that ``generate_scoring_history`` does not change gaussian GLM coefficients.

    For each of four scenarios (lambda search on/off, with and without
    2-fold cross-validation) two GLMs with an ``alpha`` array are trained that
    differ only in ``generate_scoring_history`` (True vs False); their
    coefficients must be equal.
    """
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # convert to categorical; the bare self-assignment used before was a no-op
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    # list.remove() mutates in place and returns None, so the predictor list
    # has to be bound first and the response removed afterwards
    myX = h2o_data.names
    myX.remove(myY)
    training_data, test_data = h2o_data.split_frame(ratios=[0.8])

    # Settings shared by both models of a scenario; only
    # generate_scoring_history is toggled between the two.
    scenarios = [
        # lambda search on
        dict(lambda_search=True, alpha=[0, 0.2, 1], nlambdas=5),
        # lambda search off
        dict(lambda_search=False, alpha=[0, 0.8, 1], Lambda=[0, 0.004]),
        # lambda search on, cv on
        dict(lambda_search=True, alpha=[0, 0.8, 1], nfolds=2, seed=12345,
             nlambdas=5),
        # lambda search off, cv on
        dict(lambda_search=False, alpha=[0, 0.2, 1], Lambda=[0, 0.1],
             nfolds=2, seed=12345),
    ]
    for settings in scenarios:
        coefs = []
        for with_history in (True, False):
            model = glm(family="gaussian",
                        generate_scoring_history=with_history, **settings)
            model.train(x=myX, y=myY, training_frame=training_data,
                        validation_frame=test_data)
            coefs.append(model.coef())
        pyunit_utils.assertCoefDictEqual(coefs[0], coefs[1])
Пример #9
0
def buildModelCheckCoeff(train_data, y, gamX, family):
    """
    Train the same GAM twice and assert both runs give equal coefficients.

    :param train_data: frame that is split 90/10 into training and validation parts
    :param y: response column passed to ``train``
    :param gamX: value passed as ``gam_columns`` to the estimator
    :param family: GLM family; for ``'multinomial'`` the coefficient dicts are
        compared entry by entry, otherwise they are compared directly
    """
    predictors = ["C1", "C2"]
    gam_settings = dict(family=family,
                        gam_columns=gamX,
                        scale=[0.001, 0.001, 0.001],
                        bs=[0, 0, 0],
                        num_knots=[3, 4, 5])
    train_part, test_part = train_data.split_frame(ratios=[0.9])

    # build two models with the same training / test datasets to make sure
    # repeated builds work and agree
    fitted = []
    for _ in range(2):
        gam_model = H2OGeneralizedAdditiveEstimator(**gam_settings)
        gam_model.train(x=predictors, y=y, training_frame=train_part,
                        validation_frame=test_part)
        fitted.append(gam_model)
    first_coefs, second_coefs = [model.coef() for model in fitted]

    if family != 'multinomial':
        pyunit_utils.assertCoefDictEqual(first_coefs, second_coefs)
        return
    for class_key in first_coefs:
        pyunit_utils.assertCoefDictEqual(first_coefs[class_key],
                                         second_coefs[class_key])
def test_binomial_alpha():
    """
    Check that ``generate_scoring_history`` does not change binomial GLM coefficients.

    For each of four scenarios (lambda search on/off, with and without
    2-fold cross-validation) two GLMs with an ``alpha`` array are trained that
    differ only in ``generate_scoring_history`` (True vs False); their
    coefficients must be equal.

    Previously both models of every scenario were built with
    ``generate_scoring_history=True``, so the "off" half of each scenario was
    never exercised.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3  # response column index
    X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]  # predictor column indices
    alphas = [0, 0.2, 0.5, 0.8, 1]
    lambdas = [0, 0.1, 0.01, 0.001]

    # Settings shared by both models of a scenario; only
    # generate_scoring_history is toggled between the two.
    scenarios = [
        # lambda search on
        dict(lambda_search=True),
        # lambda search off
        dict(lambda_search=False, Lambda=lambdas),
        # lambda search on, cv on
        dict(lambda_search=True, nfolds=2, seed=12345),
        # lambda search off, cv on
        dict(lambda_search=False, nfolds=2, seed=12345, Lambda=lambdas),
    ]
    for settings in scenarios:
        coefs = []
        for with_history in (True, False):
            model = H2OGeneralizedLinearEstimator(
                family="binomial", alpha=alphas,
                generate_scoring_history=with_history, **settings)
            model.train(x=X, y=Y, training_frame=training_data)
            coefs.append(model.coef())
        pyunit_utils.assertCoefDictEqual(coefs[0], coefs[1])
def test_modelselection_gaussian_coefs():
    """
    Cross-check coefficients of model selection in allsubsets, maxr and maxrsweep modes.

    The ``result()[2:4]`` frames of the maxr and maxrsweep models must match.
    Then, for every index of the allsubsets ``coef()`` list, the coefficients
    from the full lists, from ``coef(subset_size)``/``coef_norm(subset_size)``,
    from the model loaded via ``best_model_ids`` and from the maxr model are
    all compared to within 1e-6.
    """
    tolerance = 1e-6
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    def fit(mode):
        # Train a model-selection model in the requested mode.
        selector = modelSelection(seed=12345,
                                  max_predictor_number=7,
                                  mode=mode)
        selector.train(training_frame=prostate, x=predictors, y=response)
        return selector

    allsubsets = fit("allsubsets")
    allsubsets_coefs = allsubsets.coef()
    allsubsets_coefs_norm = allsubsets.coef_norm()
    maxrsweep = fit("maxrsweep")
    maxr = fit("maxr")

    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(maxr.result()[2:4],
                                      maxrsweep.result()[2:4],
                                      prob=1.0,
                                      tol=tolerance)
    maxr_coefs = maxr.coef()
    maxr_coefs_norm = maxr.coef_norm()

    for idx in range(len(allsubsets_coefs)):
        # reference coefficients: load the stored model by id and query it
        model_name = allsubsets._model_json["output"]["best_model_ids"][idx][
            'name']
        reference = h2o.get_model(model_name)
        ref_coef = reference.coef()
        ref_coef_norm = reference.coef_norm()

        # subset sizes are 1-based while list positions are 0-based
        size = idx + 1
        by_size_coef = allsubsets.coef(size)
        by_size_coef_norm = allsubsets.coef_norm(size)

        comparisons = (
            (allsubsets_coefs[idx], ref_coef),
            (allsubsets_coefs_norm[idx], ref_coef_norm),
            (by_size_coef, ref_coef),
            (by_size_coef_norm, ref_coef_norm),
            (by_size_coef, maxr_coefs[idx]),
            (by_size_coef_norm, maxr_coefs_norm[idx]),
        )
        for actual, expected in comparisons:
            pyunit_utils.assertCoefDictEqual(actual, expected, tolerance)
def test_gam_cv_fold_columns():
    """
    Train a multinomial GAM with cross-validation set up in two different ways.

    The first model uses ``nfolds=5`` with ``fold_assignment='modulo'``; the
    second uses an explicit ``fold_column`` appended to the training frame.
    Both use user-supplied knot frames, and the test asserts their
    ``'coefficients'`` entries are equal.  The comment in the original test
    notes that both configurations used to fail with an NPE.
    """
    # knot locations, one frame per gam column
    knot_values = [
        [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290],
        [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589],
        [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676],
    ]
    knot_frames = [h2o.H2OFrame(python_obj=values) for values in knot_values]

    # import the dataset and convert C1, C2 and C11 to factors
    data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    for column in ("C1", "C2", "C11"):
        data[column] = data[column].asfactor()

    # only the training part of the 80/20 split is used below
    train, _holdout = data.split_frame(ratios=[.8])

    response = "C11"
    predictors = ["C1", "C2"]
    gam_settings = dict(family='multinomial',
                        gam_columns=["C6", "C7", "C8"],
                        scale=[0, 1, 2],
                        num_knots=[5, 5, 5],
                        knot_ids=[frame.key for frame in knot_frames])

    # model 1: folds assigned by the estimator
    model_nfolds = H2OGeneralizedAdditiveEstimator(nfolds=5,
                                                   seed=1234,
                                                   fold_assignment='modulo',
                                                   **gam_settings)
    model_nfolds.train(x=predictors, y=response, training_frame=train)

    # model 2: folds taken from a column named "fold_numbers" added to train
    fold_column = train.kfold_column(n_folds=5, seed=1234)
    fold_column.set_names(["fold_numbers"])
    train = train.cbind(fold_column)
    model_fold_column = H2OGeneralizedAdditiveEstimator(**gam_settings)
    model_fold_column.train(x=predictors, y=response, training_frame=train,
                            fold_column="fold_numbers")

    # both models are expected to return the same coefficients
    # NOTE(review): model 1 uses 'modulo' assignment while the column comes
    # from kfold_column(seed=1234) -- confirm these really produce the same
    # folds, or that the final coefficients do not depend on the folds.
    coefs_nfolds = model_nfolds.coef()
    coefs_fold_column = model_fold_column.coef()
    pyunit_utils.assertCoefDictEqual(coefs_nfolds['coefficients'],
                                     coefs_fold_column['coefficients'])