def run_print_model_performance(family, train, nfolds, bc_constraints, x, y,
                                printText, seed, solver):
    """Train cross-validated GLMs with and without lambda search and print their metrics.

    Two H2OGeneralizedLinearEstimator models are built from the same settings,
    the second one with ``lambda_search=True``.  The cross-validation
    performance of each model is printed.  When ``bc_constraints`` is given,
    every constrained coefficient of both models is asserted to lie within its
    bounds or to be exactly zero.

    :param family: GLM family passed to the estimator.
    :param train: training frame passed to ``train``.
    :param nfolds: number of cross-validation folds.
    :param bc_constraints: beta constraints exposing "names", "lower_bounds" and
        "upper_bounds" columns, or None to train without constraints.
    :param x: predictor columns.
    :param y: response column.
    :param printText: banner printed before the models are built.
    :param seed: seed passed to the estimator.
    :param solver: solver passed to the estimator.
    :raises AssertionError: if a constrained coefficient is non-zero and lies
        outside its [lower, upper] bounds.
    """
    print(printText)
    # Constrained and unconstrained runs build the same estimators
    # (beta_constraints is simply None in the latter); only the banner differs.
    banner_prefix = "No beta constraints: " if bc_constraints is None else ""

    print("{0}Without lambda search and with solver {1}".format(
        banner_prefix, solver))
    h2o_model = H2OGeneralizedLinearEstimator(
        family=family,
        nfolds=nfolds,
        beta_constraints=bc_constraints,
        seed=seed,
        solver=solver)
    h2o_model.train(x=x, y=y, training_frame=train)
    print(h2o_model.model_performance(xval=True))

    print("{0}With lambda search and with solver {1}".format(
        banner_prefix, solver))
    h2o_model2 = H2OGeneralizedLinearEstimator(
        family=family,
        nfolds=nfolds,
        beta_constraints=bc_constraints,
        seed=seed,
        lambda_search=True,
        solver=solver)
    h2o_model2.train(x=x, y=y, training_frame=train)
    # Report the lambda-search model; the plain model was printed above.
    print(h2o_model2.model_performance(xval=True))

    if bc_constraints is None:
        return  # nothing to validate without constraints

    coeff = h2o_model.coef()
    coeff2 = h2o_model2.coef()
    colNames = bc_constraints["names"]
    lowerB = bc_constraints["lower_bounds"]
    upperB = bc_constraints["upper_bounds"]
    for count in range(len(colNames)):
        name = colNames[count, 0]
        lower = lowerB[count, 0]
        upper = upperB[count, 0]
        # Check the plain model first, then the lambda-search model.
        for value in (coeff[name], coeff2[name]):
            assert (lower <= value <= upper) or value == 0, \
                "coefficient exceed limits: {0} = {1}, lower limit: {2}, " \
                "upper limit: {3}".format(name, value, lower, upper)
def algo_pr_auc_test():
    """
    This pyunit test is written to make sure we can call pr_auc() on all binomial models.

    GBM, GLM, random forest and deep learning classifiers are trained on the
    prostate training data and their pr_auc() values are printed and loosely
    compared (pairwise difference below 0.9).  Finally a gaussian GLM is
    trained and pr_auc() is expected to raise for that regression model.

    :raises AssertionError: if the pr_auc values differ by 0.9 or more, or if
        pr_auc() does not raise for the regression model.
    """

    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1, max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GBM model")
    print(gbm_h2o)
    print("pr_auc for GBM model is {0}".format(gbm_h2o.pr_auc()))

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc
    print("pr_auc for GLM model is {0}".format(glm_h2o.pr_auc()))

    # Build H2O random forest classification model (no seed is set here):
    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing random forest model")
    print(rf_h2o)
    print("pr_auc for Random Forest model is {0}".format(rf_h2o.pr_auc()))

    # Build H2O deep learning classification model:
    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2,2])
    dl_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing deeplearning model")
    print(dl_h2o)
    print("pr_auc for deeplearning model is {0}".format(dl_h2o.pr_auc()))

    assert abs(gbm_h2o.pr_auc()-glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o.pr_auc()-dl_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o.pr_auc()-glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    # try to call pr_auc() for regression.  Should encounter error.
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    myY = "GLEASON"
    myX = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity",alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    try:
        print(h2o_model.pr_auc())
    except Exception as err:
        # Expected path: pr_auc() is not available for a regression model.
        print("pr_auc() raised an error for the regression model as expected: {0}".format(err))
    else:
        # The failure is raised outside the try body so it cannot be swallowed
        # by the handler above.
        raise AssertionError("pr_auc() should raise an error for regression but did not.")
def test_GLM_RCC_warning():
    """Check the warning text written to stderr by tweedie GLMs that use
    remove_collinear_columns=True with regularization.

    Two models are trained on the prostate data with stderr redirected into a
    buffer: one with ``lambda_search=True`` and one with ``Lambda=0.01``.  The
    captured text must contain the expected warning phrase in each case.

    stderr is restored to whatever it was before the redirect even when
    training raises, so a failure here does not silence later output.

    :raises AssertionError: if the expected warning phrase is not captured.
    """
    warnNumber = 1
    hdf = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print("Testing for family: TWEEDIE")
    print("Set variables for h2o.")
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print("Create models with lambda_search")
    captured = StringIO()  # redirect output
    saved_stderr = sys.stderr
    sys.stderr = captured
    try:
        model_h2o_tweedie = H2OGeneralizedLinearEstimator(
            family="tweedie",
            link="tweedie",
            lambda_search=True,
            remove_collinear_columns=True,
            solver="irlsm")
        model_h2o_tweedie.train(
            x=x, y=y, training_frame=hdf)  # this should generate a warning message
    finally:
        sys.stderr = saved_stderr  # redirect printout back to normal path

    # check and make sure we get the correct warning message
    warn_phrase = "It is used improperly here with lambda_search"
    try:  # for python 2.7
        assert len(captured.buflist) == warnNumber
        print(captured.buflist[0])
        assert warn_phrase in captured.buflist[0]
    except Exception:  # for python 3 (no buflist), or when the 2.7 check fails
        warns = captured.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert warn_phrase in warns

    print("Create models with non-zero lambda")
    captured = StringIO()  # redirect output
    saved_stderr = sys.stderr
    sys.stderr = captured
    try:
        model_h2o_tweedie = H2OGeneralizedLinearEstimator(
            family="tweedie",
            link="tweedie",
            Lambda=0.01,
            remove_collinear_columns=True,
            solver="irlsm")
        model_h2o_tweedie.train(
            x=x, y=y, training_frame=hdf)  # this should generate a warning message
    finally:
        sys.stderr = saved_stderr  # redirect printout back to normal path

    # check and make sure we get the correct warning message
    warn_phrase = "It is used improperly here.  Please set lambda=0"
    try:  # for python 2.7
        assert len(captured.buflist) == warnNumber
        print(captured.buflist[0])
        assert warn_phrase in captured.buflist[0]
    except Exception:  # for python 3 (no buflist), or when the 2.7 check fails
        warns = captured.getvalue()
        print("*** captured warning message: {0}".format(warns))
        assert warn_phrase in warns
def run_print_model_performance(family, train, nfolds, bc_constraints, x, y,
                                printText, seed, solver):
    """Fit a plain and a lambda-search GLM, print both cross-validation reports,
    and verify beta constraints when they are supplied.

    :param family: GLM family passed to the estimator.
    :param train: training frame.
    :param nfolds: number of cross-validation folds.
    :param bc_constraints: beta constraints exposing "names", "lower_bounds" and
        "upper_bounds" columns, or None (the estimator is then built without
        the ``beta_constraints`` argument).
    :param x: predictor columns.
    :param y: response column.
    :param printText: banner printed first.
    :param seed: estimator seed.
    :param solver: estimator solver.
    :raises AssertionError: if a constrained coefficient is non-zero and either
        below its lower bound or above its upper bound by 1e-6 or more.
    """
    print(printText)

    # Arguments shared by both estimators; constraints are only passed when given.
    glm_args = {"family": family, "nfolds": nfolds}
    if bc_constraints is not None:
        glm_args["beta_constraints"] = bc_constraints
    glm_args["seed"] = seed
    glm_args["solver"] = solver

    print("Without lambda search, solver = {0}".format(solver))
    plain_glm = H2OGeneralizedLinearEstimator(**glm_args)
    plain_glm.train(x=x, y=y, training_frame=train)
    print(plain_glm.model_performance(xval=True))

    print("With lambda search, solver = {0}".format(solver))
    search_glm = H2OGeneralizedLinearEstimator(lambda_search=True, **glm_args)
    search_glm.train(x=x, y=y, training_frame=train)
    print(search_glm.model_performance(xval=True))

    if bc_constraints is None:
        return

    def _assert_within(value, low, high):
        # The upper bound is allowed a 1e-6 overshoot; an exact zero always passes.
        under_cap = value < high or (value - high) < 1e-6
        assert (value >= low and under_cap) or value == 0, \
            "coeff: {0}, lower limit: {1}, upper limit: {2}".format(value, low, high)

    plain_coefs = plain_glm.coef()
    search_coefs = search_glm.coef()
    names = bc_constraints["names"]
    lows = bc_constraints["lower_bounds"]
    highs = bc_constraints["upper_bounds"]
    for row in range(0, len(names)):
        key = names[row, 0]
        low, high = lows[row, 0], highs[row, 0]
        _assert_within(plain_coefs[key], low, high)
        _assert_within(search_coefs[key], low, high)
示例#5
0
def link_functions_poisson():
    """Compare H2O Poisson GLMs against statsmodels for the log and identity links.

    The prostate data is loaded into H2O and, separately, into a NumPy array
    for statsmodels.  For each link, the ratio residual deviance / null
    deviance of the H2O model must not exceed the statsmodels ratio by 0.01 or
    more.

    :raises AssertionError: if H2O's deviance ratio is worse than statsmodels'
        by 0.01 or more.
    """
    print("Read in prostate data.")
    zip_path = pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=zip_path)

    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available in old and new releases.  The archive is closed once parsed.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    # presumably column 9 is GLEASON and columns 1..8 match myX below --
    # TODO confirm against the CSV header
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: POISSON")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create h2o model with canonical link: LOG")
    h2o_model_log = H2OGeneralizedLinearEstimator(family="poisson",
                                                  link="log",
                                                  alpha=0.5,
                                                  Lambda=0)
    h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)

    print("Create statsmodel model with canonical link: LOG")
    sm_model_log = sm.GLM(endog=sm_data_response,
                          exog=sm_data_features,
                          family=sm.families.Poisson(
                              sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(),
                               h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance,
                              sm_model_log.null_deviance)
    # One-sided on purpose: H2O may be better than statsmodels, not worse.
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create h2o models with link: IDENTITY")
    h2o_model_id = H2OGeneralizedLinearEstimator(family="poisson",
                                                 link="identity",
                                                 alpha=0.5,
                                                 Lambda=0)
    h2o_model_id.train(x=myX, y=myY, training_frame=h2o_data)

    print("Create statsmodel models with link: IDENTITY")
    sm_model_id = sm.GLM(endog=sm_data_response,
                         exog=sm_data_features,
                         family=sm.families.Poisson(
                             sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance_id = old_div(h2o_model_id.residual_deviance(),
                              h2o_model_id.null_deviance())
    sm_deviance_id = old_div(sm_model_id.deviance, sm_model_id.null_deviance)
    assert h2o_deviance_id - sm_deviance_id < 0.01, "expected h2o to have an equivalent or better deviance measures"
示例#6
0
def interactions_GLM_Binomial():
    """Exercise binomial GLM interaction pairs on frames with and without NA values.

    First a model with three interaction pairs (num by num, num by enum, enum
    by enum) is trained on a frame containing NA entries and on a frame
    without them, and the two coefficient tables are compared with
    assert_arrays_equal_NA.  Then performOneTest is run for each single
    interaction kind: num by num, enum by enum and enum by num.
    """
    # test multiple interaction pairs: enum by enum, enum by num and num by num, all with NA terms
    print("******* Test interaction pairs")
    all_columns = ['label','numerical_feat','numerical_feat2','categorical_feat',
                   'categorical_feat2']
    raw_with_na = np.array([[1,0,1,0,1,0], [1,2,4.2/2.2,4,3,1], [2,3,float('NaN'),1,2,3],
                            ["a","a","a","b","a","b"], ['Foo','UNKNOWN','Foo','Foo','Foo','Bar']])
    pandas_with_na = pd.DataFrame(raw_with_na.T, columns=all_columns)
    frame_with_na = h2o.H2OFrame(pandas_with_na, na_strings=["UNKNOWN"])
    raw_clean = np.array([[1,0,1,0,1,0], [1,2,4.2/2.2,4,3,1], [2,3,2.2,1,2,3],
                          ["a","a","a","b","a","b"], ['Foo','Foo','Foo','Foo','Foo','Bar']])
    pandas_clean = pd.DataFrame(raw_clean.T, columns=all_columns)
    frame_clean = h2o.H2OFrame(pandas_clean, na_strings=["UNKNOWN"])

    pairs = [("numerical_feat", "numerical_feat2"),("numerical_feat", "categorical_feat2"),
             ("categorical_feat", "categorical_feat2")]
    predictors = ['numerical_feat','numerical_feat2','categorical_feat','categorical_feat2']

    # model trained on the frame that contains NA values
    glm_with_na = H2OGeneralizedLinearEstimator(family = "Binomial", alpha=0, lambda_search=False,
                                                interaction_pairs=pairs, standardize=False)
    glm_with_na.train(x=predictors, y='label', training_frame=frame_with_na)
    # same settings, trained on the frame without NA values
    glm_clean = H2OGeneralizedLinearEstimator(family = "Binomial", alpha=0, lambda_search=False,
                                              interaction_pairs=pairs, standardize=False)
    glm_clean.train(x=predictors, y='label', training_frame=frame_clean)
    table_with_na = glm_with_na._model_json['output']['coefficients_table'].cell_values
    table_clean = glm_clean._model_json['output']['coefficients_table'].cell_values
    assert_arrays_equal_NA(table_with_na, table_clean)

    # test interaction of num and num columns
    print("******* Test interaction with num by num")
    num_num_columns = ['label', 'numerical_feat', 'numerical_feat2']
    num_num_with_na = pd.DataFrame(np.array([[1,0,1,0], [1,2,2,4], [2, 3, float('NaN'), 1]]).T,
                                   columns=num_num_columns)
    num_num_clean = pd.DataFrame(np.array([[1,0,1,0], [1,2,2,4], [2, 3, 2, 1]]).T,
                                 columns=num_num_columns)
    performOneTest(num_num_with_na, num_num_clean, interactionColumn=['numerical_feat', 'numerical_feat2'],
                   xcols=['numerical_feat', 'numerical_feat2'], standard=False)

    # test interaction of enum and enum columns
    print("******* Test interaction with enum by enum")
    cat_cat_columns = ['label', 'categorical_feat', 'categorical_feat2']
    cat_cat_with_na = pd.DataFrame(np.array([[1,0,1,0], ["a", "a", "b", "b"], ['Foo', 'UNKNOWN', 'Foo', 'Bar']]).T,
                                   columns=cat_cat_columns)
    cat_cat_clean = pd.DataFrame(np.array([[1,0,1,0], ["a", "a", "b", "b"], ['Foo', 'Foo', 'Foo', 'Bar']]).T,
                                 columns=cat_cat_columns)
    performOneTest(cat_cat_with_na, cat_cat_clean, interactionColumn=['categorical_feat', 'categorical_feat2'],
                   xcols=['categorical_feat', 'categorical_feat2'])

    # test interaction of enum and num columns
    print("******* Test interaction with enum by num")
    cat_num_columns = ['label', 'numerical_feat', 'categorical_feat']
    cat_num_with_na = pd.DataFrame(np.array([[1,0,1,0], [1,2,3,4], ['Foo', 'UNKNOWN', 'Foo', 'Bar']]).T,
                                   columns=cat_num_columns)
    cat_num_clean = pd.DataFrame(np.array([[1,0,1,0], [1,2,3,4], ['Foo', 'Foo', 'Foo', 'Bar']]).T,
                                 columns=cat_num_columns)
    performOneTest(cat_num_with_na, cat_num_clean, interactionColumn=['numerical_feat', 'categorical_feat'],
                   xcols=['numerical_feat', 'categorical_feat'])
def link_functions_negbinomial():
    """Compare H2O negative binomial GLMs against statsmodels for several thetas.

    For every theta in a fixed list, a log-link and an identity-link model are
    trained in both H2O and statsmodels and each pair is handed to
    compareModels.  Each statsmodels model uses the same link as the H2O model
    it is compared with.
    """

    print("Read in prostate data.")
    zip_path = pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=zip_path)

    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available in old and new releases.  The archive is closed once parsed.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    # presumably column 9 is GLEASON and columns 1..8 match myX below --
    # TODO confirm against the CSV header
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    thetas = [0.000000001, 0.01, 0.1, 0.5, 1]
    for thetaO in thetas:
        print("Create statsmodel model with canonical link: LOG")
        # The statsmodels link must match the H2O link ("log") it is compared to.
        sm_model_log = sm.GLM(endog=sm_data_response,
                              exog=sm_data_features,
                              family=sm.families.NegativeBinomial(
                                  sm.families.links.log, thetaO)).fit()
        print("Create h2o model with canonical link: LOG")
        h2o_model_log = H2OGeneralizedLinearEstimator(
            family="negativebinomial",
            link="log",
            alpha=0.5,
            Lambda=0,
            theta=thetaO)
        h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)
        print(
            "Comparing H2O model and Python model with log link and theta={0}".
            format(thetaO))
        compareModels(h2o_model_log, sm_model_log)

        print("Create statsmodel model with canonical link: identity")
        # The statsmodels link must match the H2O link ("identity") it is compared to.
        sm_model_identity = sm.GLM(endog=sm_data_response,
                                   exog=sm_data_features,
                                   family=sm.families.NegativeBinomial(
                                       sm.families.links.identity,
                                       thetaO)).fit()
        print("Create h2o model with canonical link: identity")
        h2o_model_identity = H2OGeneralizedLinearEstimator(
            family="negativebinomial",
            link="identity",
            alpha=0.5,
            Lambda=0,
            theta=thetaO)
        h2o_model_identity.train(x=myX, y=myY, training_frame=h2o_data)
        print(
            "Comparing H2O model and Python model with identity link and theta = {0}"
            .format(thetaO))
        compareModels(h2o_model_identity, sm_model_identity)
def test_relevel():
    """Check that relevel() changes which DPROS level a binomial GLM drops.

    A GLM is trained on prostate_cat.csv before and after releveling the DPROS
    column to "None", and the coefficient names are checked for the expected
    skipped level each time.  The coefficients of the releveled model are then
    compared against reference values taken from R (tolerance 1e-4).

    :raises AssertionError: if an unexpected DPROS level is present or missing,
        or if any coefficient differs from the reference by 1e-4 or more.
    """
    #First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family="binomial",
                                          Lambda=0,
                                          missing_values_handling="Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    # NOTE: `assert (cond, msg)` asserts a non-empty tuple and can never fail,
    # so the condition and message are passed to assert separately.
    assert "DPROS.None" in ns, \
        "None level IS NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns, \
        "Both level IS expected to be skipped by default"
    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family="binomial",
                                          Lambda=0,
                                          missing_values_handling="Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    # After releveling, "None" is the reference level: it is the one dropped,
    # and "Both" gets a coefficient (it appears in exp_coefs below).
    assert "DPROS.None" not in ns2, \
        "None level IS expected to be skipped after relevel"
    assert "DPROS.Both" in ns2, \
        "Both level IS NOT expected to be skipped after relevel"

    #Second, compare against R input (taken from runit_relevel.R)
    # NOTE(review): dr is built but never used by the checks below.
    dr = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")
    #Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {
        "Intercept": -7.63245,
        "DPROS.Both": 1.39185,
        "DPROS.Left": 0.73482,
        "DPROS.Right": 1.51437,
        "RACE.White": 0.65160,
        "DCAPS.Yes": 0.49233,
        "AGE": -0.01189,
        "PSA": 0.02990,
        "VOL": -0.01141,
        "GLEASON": 0.96466927
    }
    # A coefficient missing from the model counts as 0 in the comparison.
    coeff_diff = {
        key: abs(exp_coefs[key] - mh2o2.coef().get(key, 0))
        for key in exp_coefs.keys()
    }
    assert (max(coeff_diff.values()) < 1e-4)
示例#9
0
def test_lambda_warning():
    """Train a gaussian GLM with both lambda_search and an explicit Lambda and
    check that the expected warning is captured.

    The model is trained once outside the warning-capture context and once
    inside it; the captured warnings must contain the phrase
    'disabled when user specified any lambda value(s)'.
    """
    boston = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))
    response = 13
    predictors = list(range(13))

    def _train_glm():
        # Builds a fresh estimator (and a fresh Lambda list) on every call.
        glm = H2OGeneralizedLinearEstimator(family="Gaussian", lambda_search=True, Lambda=[0.01])
        glm.train(x=predictors, y=response, training_frame=boston)
        return glm

    _train_glm()

    with pyunit_utils.catch_warnings() as caught:
        _train_glm()

        expected_phrase = 'disabled when user specified any lambda value(s)'
        assert pyunit_utils.contains_warning(caught, expected_phrase)
示例#10
0
def link_functions_gamma():
    """Compare H2O gamma GLMs against statsmodels for the inverse and log links.

    The prostate data is loaded into H2O and, separately, into a NumPy array
    for statsmodels.  For each link, the ratio residual deviance / null
    deviance of the H2O model must not exceed the statsmodels ratio by 0.01 or
    more.

    :raises AssertionError: if H2O's deviance ratio is worse than statsmodels'
        by 0.01 or more.
    """
    print("Read in prostate data.")
    zip_path = pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=zip_path)
    h2o_data.head()

    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available in old and new releases.  The archive is closed once parsed.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    # presumably column 5 is DPROS and the remaining indices match myX below --
    # TODO confirm against the CSV header
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    print("Create models with canonical link: INVERSE")
    h2o_model_in = H2OGeneralizedLinearEstimator(family="gamma",
                                                 link="inverse",
                                                 alpha=0.5,
                                                 Lambda=0)
    h2o_model_in.train(x=myX, y=myY, training_frame=h2o_data)

    sm_model_in = sm.GLM(endog=sm_data_response,
                         exog=sm_data_features,
                         family=sm.families.Gamma(
                             sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_deviance_in = old_div(h2o_model_in.residual_deviance(),
                              h2o_model_in.null_deviance())
    sm_deviance_in = old_div(sm_model_in.deviance, sm_model_in.null_deviance)
    # One-sided on purpose: H2O may be better than statsmodels, not worse.
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = H2OGeneralizedLinearEstimator(family="gamma",
                                                  link="log",
                                                  alpha=0.5,
                                                  Lambda=0)
    h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)
    sm_model_log = sm.GLM(endog=sm_data_response,
                          exog=sm_data_features,
                          family=sm.families.Gamma(
                              sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_deviance_log = old_div(h2o_model_log.residual_deviance(),
                               h2o_model_log.null_deviance())
    sm_deviance_log = old_div(sm_model_log.deviance,
                              sm_model_log.null_deviance)
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
示例#11
0
def buildModelCheckStdCoeffs(training_fileName, family):
    """Cross-check coef() and coef_norm() of GLMs built with and without standardization.

    The last column of the file is the response; the first half of the
    remaining columns are converted to enums.  Three models are trained:
    one with standardize=True, one with standardize=False on the raw data, and
    one with standardize=False after the numerical columns have been
    standardized by hand.  Coefficients are compared through
    assert_coeffs_equal and compare_coeffs_2_model.

    :param training_fileName: path handed to pyunit_utils.locate.
    :param family: GLM family; 'binomial' and 'multinomial' turn the response
        into a factor, 'multinomial' is compared class by class.
    """
    frame = h2o.import_file(pyunit_utils.locate(training_fileName))
    response_idx = frame.ncols - 1
    predictor_idx = list(range(0, response_idx))
    enum_count = int(response_idx / 2)  # first half of the columns are enums

    if family in ('binomial', 'multinomial'):
        frame[response_idx] = frame[response_idx].asfactor()
    for col in range(enum_count):
        frame[col] = frame[col].asfactor()

    glm_standardized = H2OGeneralizedLinearEstimator(family=family, standardize=True)
    glm_standardized.train(training_frame=frame, x=predictor_idx, y=response_idx)
    reference_std_coefs = glm_standardized.coef_norm()

    glm_raw = H2OGeneralizedLinearEstimator(family=family, standardize=False)
    glm_raw.train(training_frame=frame, x=predictor_idx, y=response_idx)
    raw_std_coefs = glm_raw.coef_norm()
    raw_coefs = glm_raw.coef()

    if family != 'multinomial':  # for binomial and gaussian
        assert_coeffs_equal(raw_std_coefs, raw_coefs, frame)
    else:
        for class_idx in range(len(raw_coefs)):
            suffix = str(class_idx)
            std_for_class = raw_std_coefs["std_coefs_class_" + suffix]
            plain_for_class = raw_coefs["coefs_class_" + suffix]
            print("Comparing multinomial coefficients for class {0}".format(
                class_idx))
            assert_coeffs_equal(std_for_class, plain_for_class, frame)

    # change the numerical columns to have mean 0 and std 1
    for col in range(enum_count, response_idx):
        col_mean = frame[col].mean()
        inv_sigma = 1.0 / math.sqrt(frame[col].var())
        frame[col] = (frame[col] - col_mean) * inv_sigma

    glm_manual = H2OGeneralizedLinearEstimator(family=family, standardize=False)
    glm_manual.train(training_frame=frame, x=predictor_idx, y=response_idx)
    manual_std_coefs = glm_manual.coef_norm()
    # standardized coefficients from the first and the third model should agree
    compare_coeffs_2_model(family, reference_std_coefs, manual_std_coefs)

    # coef() and coef_norm() of the third model are compared as well, since its
    # training data were standardized before training.
    manual_coefs = glm_manual.coef()
    compare_coeffs_2_model(family, manual_std_coefs, manual_coefs, sameModel=True)
示例#12
0
def shuffling_large():
    """Check that binomial GLM coefficients on Arcene are reproducible and
    unaffected by shuffling the data.

    Two models are trained on the original data and one on the shuffled data;
    the coefficient table of the first model is compared row by row with each
    of the other two (same cell types, float cells within 5e-10).
    """
    print("Reading in Arcene training data for binomial modeling.")
    original = h2o.upload_file(path=pyunit_utils.locate(
        "smalldata/arcene/shuffle_test_version/arcene.csv"))
    shuffled = h2o.upload_file(path=pyunit_utils.locate(
        "smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    def _fit(frame):
        # Same estimator settings for every model in this test.
        glm = H2OGeneralizedLinearEstimator(family="binomial",
                                            lambda_search=True,
                                            alpha=0.5)
        glm.train(x=range(1000), y=1000, training_frame=frame)
        return glm

    def _coef_rows(glm):
        return glm._model_json['output']['coefficients_table'].cell_values

    def _assert_rows_match(rows_a, rows_b):
        # zip stops at the shorter table, exactly like a plain paired loop.
        for row_a, row_b in zip(rows_a, rows_b):
            same_types = (type(row_a[1]) == type(row_b[1])) and \
                         (type(row_a[2]) == type(row_b[2]))
            assert same_types, "coefficients should be the same type"
            for cell in (1, 2):
                if isinstance(row_a[cell], float):
                    assert abs(row_a[cell] - row_b[cell]) < 5e-10, \
                        "coefficients should be equal"

    print("Create model on original Arcene dataset.")
    first_glm = _fit(original)

    print("Create second model on original Arcene dataset.")
    second_glm = _fit(original)

    print("Create model on shuffled Arcene dataset.")
    shuffled_glm = _fit(shuffled)

    print(
        "Assert that number of predictors remaining and their respective coefficients are equal."
    )

    _assert_rows_match(_coef_rows(first_glm), _coef_rows(second_glm))
    _assert_rows_match(_coef_rows(first_glm), _coef_rows(shuffled_glm))
示例#13
0
def test_HGLM_R():
    """Train an HGLM with and without explicit start values and require that
    both runs report the same training metrics.

    Scalar metrics must agree within 1e-4; array metrics are compared through
    pyunit_utils.equal_two_arrays.
    """
    semiconductor = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/semiconductor.csv"))
    response = "y"
    predictors = ["x1", "x3", "x5", "x6"]
    random_cols = [0]
    tolerance = 1e-4
    semiconductor[0] = semiconductor[0].asfactor()
    initial_values = [
        0.001929687, 0.002817188, -0.001707812, -0.003889062, 0.010685937, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0.1
    ]

    # model that uses the default starting values
    default_glm = H2OGeneralizedLinearEstimator(HGLM=True,
                                                family="gaussian",
                                                rand_family=["gaussian"],
                                                random_columns=random_cols,
                                                calc_like=True)
    default_glm.train(x=predictors, y=response, training_frame=semiconductor)
    default_metrics = default_glm.training_model_metrics()

    # model that is given explicit starting values
    seeded_glm = H2OGeneralizedLinearEstimator(HGLM=True,
                                               family="gaussian",
                                               rand_family=["gaussian"],
                                               random_columns=random_cols,
                                               calc_like=True,
                                               startval=initial_values)
    seeded_glm.train(x=predictors, y=response, training_frame=semiconductor)
    seeded_metrics = seeded_glm.training_model_metrics()

    # compare model metrics from both models and they should be the same
    scalar_names = (
        "hlik", "pvh", "dfrefe", "varfix", "pbvh", "convergence", "caic",
        "sumetadiffsquare",
    )
    array_names = ("summvc1", "sefe", "varranef", "ranef", "sere", "fixef")

    for name in scalar_names:
        expected = default_metrics[name]
        actual = seeded_metrics[name]
        assert abs(expected - actual) < tolerance, \
            "expected {0}: {1}, actual {0}: {2}".format(name, expected, actual)
    for name in array_names:
        pyunit_utils.equal_two_arrays(default_metrics[name],
                                      seeded_metrics[name],
                                      1e-10, tolerance)
def link_correct_default():
    """Check that omitting ``link`` gives the same model as naming the link.

    For each of the gaussian, binomial, poisson and gamma families, one GLM
    is trained without a ``link`` argument and one with the link spelled out
    explicitly.  The cell values of the two coefficient tables must be equal.
    """
    print("Reading in original prostate data.")
    prostate = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv.zip"))

    print("Compare models with link unspecified and canonical link specified.")
    # (banner, family, explicit link, predictor column indices, response index)
    cases = (
        ("GAUSSIAN: ", "gaussian", "identity", range(1, 8), 8),
        ("BINOMIAL: ", "binomial", "logit", range(2, 9), 1),
        ("POISSON: ", "poisson", "log", range(2, 9), 1),
        ("GAMMA: ", "gamma", "inverse", range(3, 9), 2),
    )

    for banner, family, link, predictor_idx, response_idx in cases:
        print(banner)

        default_link_model = H2OGeneralizedLinearEstimator(family=family)
        # A fresh list is handed to every train() call, as in the original.
        default_link_model.train(x=list(predictor_idx),
                                 y=response_idx,
                                 training_frame=prostate)

        explicit_link_model = H2OGeneralizedLinearEstimator(family=family,
                                                            link=link)
        explicit_link_model.train(x=list(predictor_idx),
                                  y=response_idx,
                                  training_frame=prostate)

        explicit_table = explicit_link_model._model_json['output'][
            'coefficients_table']
        default_table = default_link_model._model_json['output'][
            'coefficients_table']
        assert explicit_table.cell_values == default_table.cell_values, \
            "coefficient should be equal"
示例#15
0
def buildModelCheckStdCoeffs(training_fileName, family):
    """Compare GLM coefficients obtained with and without standardization.

    Steps performed:

    1. Import the file; the last column is the response, the first half of
       the remaining columns are converted to enums.
    2. Train one model with ``standardize=True`` and keep its ``coef_norm()``.
    3. Train one model with ``standardize=False`` and pass its
       ``coef_norm()`` and ``coef()`` to ``assert_coeffs_equal``.
    4. Standardize the numerical predictor columns by hand, train again with
       ``standardize=False`` and require that model's ``coef_norm()`` to match
       the result from step 2.

    :param training_fileName: path (resolved through ``pyunit_utils.locate``)
        of the training file.
    :param family: GLM family name; ``'binomial'`` and ``'multinomial'`` turn
        the response into a factor, ``'multinomial'`` results are compared
        class by class.
    """
    training_data = h2o.import_file(pyunit_utils.locate(training_fileName))
    ncols = training_data.ncols
    Y = ncols - 1
    x = list(range(0, Y))
    enumCols = Y // 2  # first half of the predictor columns are enums
    if family == 'binomial' or family == 'multinomial':
        training_data[Y] = training_data[Y].asfactor()
    for ind in range(enumCols):
        training_data[ind] = training_data[ind].asfactor()
    model1 = H2OGeneralizedLinearEstimator(family=family, standardize=True)
    model1.train(training_frame=training_data, x=x, y=Y)
    stdCoeff1 = model1.coef_norm()
    modelNS = H2OGeneralizedLinearEstimator(family=family, standardize=False)
    modelNS.train(training_frame=training_data, x=x, y=Y)

    coeffNSStandardized = modelNS.coef_norm()
    coeffNS = modelNS.coef()
    if family == 'multinomial':
        # Multinomial results are keyed per class.
        nclass = len(coeffNS)
        for cind in range(nclass):
            coeff1PerClass = coeffNSStandardized["std_coefs_class_" +
                                                 str(cind)]
            coeff2PerClass = coeffNS["coefs_class_" + str(cind)]
            print("Comparing multinomial coefficients for class {0}".format(
                cind))
            assert_coeffs_equal(coeff1PerClass, coeff2PerClass, training_data)
    else:  # for binomial and gaussian
        assert_coeffs_equal(coeffNSStandardized, coeffNS, training_data)

    # Standardize the numerical columns by hand: mean 0 and std 1.
    for ind in range(enumCols, Y):
        aver = training_data[ind].mean()
        sigma = 1.0 / math.sqrt(training_data[ind].var())
        training_data[ind] = (training_data[ind] - aver) * sigma

    model2 = H2OGeneralizedLinearEstimator(family=family, standardize=False)
    model2.train(training_frame=training_data, x=x, y=Y)
    coeff2 = model2.coef_norm()

    if family == 'multinomial':  # special treatment, it contains a dict of dict
        assert len(stdCoeff1) == len(coeff2), "Coefficient dictionary lengths are different.  One has length {0} while" \
                                              " the other one has length {1}.".format(len(stdCoeff1), len(coeff2))
        for name in stdCoeff1.keys():
            pyunit_utils.equal_two_dicts(stdCoeff1[name], coeff2[name])
    else:
        # Compare against the manually standardized model; the previous code
        # compared stdCoeff1 with itself, which could never fail.
        pyunit_utils.equal_two_dicts(stdCoeff1, coeff2)
def interactions():
  """Check that ``interactions`` and the matching ``interaction_pairs`` agree.

  Two binomial GLMs with lambda search are trained on the airlines data: one
  given a list of interaction columns, the other given the three explicit
  pairs of those columns.  Their coefficient tables must list the same names.
  """
  airlines = h2o.import_file(pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))

  # Column positions below are 1-based, hence the "- 1" when indexing.
  all_names = airlines.names
  XY = [all_names[pos - 1] for pos in (1, 2, 3, 4, 6, 8, 9, 13, 17, 18, 19, 31)]
  interaction_cols = [XY[pos - 1] for pos in (5, 7, 9)]
  assert interaction_cols == ["CRSDepTime", "UniqueCarrier", "Origin"]

  response = XY[-1]

  # NOTE(review): x is the full XY list, so it also contains the response
  # column passed as y - confirm this is intended.
  implicit_model = H2OGeneralizedLinearEstimator(lambda_search=True, family="binomial",
                                                 interactions=interaction_cols)
  implicit_model.train(x=list(XY), y=response, training_frame=airlines)
  implicit_coefs = implicit_model._model_json['output']['coefficients_table']

  pairs = [("CRSDepTime", "UniqueCarrier"), ("CRSDepTime", "Origin"), ("UniqueCarrier", "Origin")]
  explicit_model = H2OGeneralizedLinearEstimator(lambda_search=True, family="binomial",
                                                 interaction_pairs=pairs)
  explicit_model.train(x=list(XY), y=response, training_frame=airlines)
  explicit_coefs = explicit_model._model_json['output']['coefficients_table']

  assert implicit_coefs["names"] == explicit_coefs["names"]
示例#17
0
def link_functions_tweedie_vpow():
  """Fit tweedie GLMs for several variance powers and compare with R values.

  For every variance power the residual/null deviance ratio must not be more
  than 0.01 below the stored R ratio, and the null deviance must match the
  stored R null deviance to within 1e-6.
  """
  # Load example data from HDtweedie, y = aggregate claim loss
  hdf = h2o.upload_file(pyunit_utils.locate("smalldata/glm_test/auto.csv"))
  y = "y"
  x = list(set(hdf.names) - {"y"})

  print("Testing for family: TWEEDIE")
  print("Create models with canonical link: TWEEDIE")

  # One entry per variance power: the power itself plus the reference
  # residual/null ratio and null deviance obtained from R.
  vpower = [0, 1, 1.5]
  r_dev = [0.7516627, 0.6708826, 0.7733762]
  r_null = [221051.88369951, 32296.29783702, 20229.47425307]

  for vpow, r_ratio, r_null_dev in zip(vpower, r_dev, r_null):
    print("Fit h2o.glm:")
    h2ofit = H2OGeneralizedLinearEstimator(family="tweedie",
                                           link="tweedie",
                                           tweedie_variance_power=vpow,
                                           tweedie_link_power=1-vpow,
                                           alpha=0.5,
                                           Lambda=0)
    h2ofit.train(x=x, y=y, training_frame=hdf)

    print("Testing Tweedie variance power: {0}".format(vpow))

    print("Compare model deviances for link function tweedie")
    residual_dev = h2ofit.residual_deviance()
    null_dev = h2ofit.null_deviance()
    h2o_ratio = old_div(residual_dev, null_dev)

    assert r_ratio - h2o_ratio <= 0.01, \
      "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: {0}, r: {1}".format(h2o_ratio, r_ratio)

    print("compare null and residual deviance between R glm and h2o.glm for tweedie")
    assert abs(r_null_dev - null_dev) < 1e-6, \
      "h2o's null deviance is not equal to R's. h2o: {0}, r: {1}".format(h2ofit.null_deviance(), r_null_dev)
def test_glm_backward_compare():
    """Compare backward model selection against a plain weighted GLM.

    A backward-mode model selection run with ``min_predictor_number=200`` and
    an unregularized (``lambda_=0``) binomial GLM are trained on the same
    predictors with the same weights column.  The first coefficient set of the
    selection model must equal the GLM coefficients to within 1e-6.
    """
    frame = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))

    # Taken before the weight column is appended: every column but the last.
    predictor_names = frame.columns[:-1]
    response_name = 'response'
    weight_name = 'wt'

    # Rows with response == 1 get weight 100, all others weight 1.
    frame['wt'] = 1
    frame[frame['response'] == 1, 'wt'] = 100
    frame['response'] = frame['response'].asfactor()

    min_predictors = 200
    selection_model = H2OModelSelectionEstimator(
        family='binomial',
        weights_column=weight_name,
        mode='backward',
        min_predictor_number=min_predictors)
    selection_model.train(predictor_names,
                          response_name,
                          training_frame=frame)
    selection_coeffs = selection_model.coef()[0]

    reference_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                                  lambda_=0,
                                                  compute_p_values=True,
                                                  weights_column=weight_name)
    reference_glm.train(predictor_names, response_name, training_frame=frame)
    reference_coeffs = reference_glm.coef()

    pyunit_utils.assertEqualCoeffDicts(reference_coeffs,
                                       selection_coeffs,
                                       tol=1e-6)
def toy_classifications():
    """Fit a binomial GLM and a random forest on the module-level frame ``fr``.

    The frame is split by ``divide_train_test``.  The GLM is trained once on
    the training split alone and once more with a validation split, after
    which its validation ROC is plotted.  A 200-tree random forest is then
    trained and its validation ROC is handed to ``plot_betas``.
    """
    # Alternative split: fr.split_frame([0.6, 0.2], seed=1234)
    train, valid, test = divide_train_test(fr)

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    base_columns = [
        'VatRatio',
        'LocalVatRatio',
        'TurnoverGross',
        'TotalReturnCount',
        'RefundClaimedBoolean',
    ]
    features = base_columns + share_cols

    # First pass: training data only.
    glm.train(x=features, y="y", training_frame=train)
    glm.confusion_matrix()

    # Second pass: same estimator, now with a validation frame.
    glm.train(x=features, y="y", training_frame=train, validation_frame=valid)
    glm.confusion_matrix(valid=True)
    roc_points = glm.roc(valid=1)
    plt.plot(*roc_points)

    # Random Forest
    var_y = 'y'
    forest_settings = dict(
        model_id="rf_v1",
        ntrees=200,
        stopping_rounds=2,
        score_each_iteration=True,
        seed=1000000,
    )
    rf_v1 = H2ORandomForestEstimator(**forest_settings)

    rf_v1.train(features, var_y, training_frame=train, validation_frame=valid)
    rf_v1.confusion_matrix(valid=1)
    plot_betas(rf_v1.roc(valid=1))
示例#20
0
def glm_solvers():
    """Train a GLM for every listed solver/family combination on the cars data.

    The response column depends on the family: ``economy_20mpg`` (as a
    factor) for binomial, ``economy`` for gaussian and ``cylinders`` for the
    remaining families (both converted to numeric).  The function makes no
    assertions of its own; it only runs the training calls.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    for solver in [
            "AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
            "COORDINATE_DESCENT"
    ]:
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statement was a syntax error on Python 3.
        print("Solver = {0}".format(solver))
        for family in ["binomial", "gaussian", "poisson", "tweedie", "gamma"]:
            if family == 'binomial':
                response_col = "economy_20mpg"
            elif family == 'gaussian':
                response_col = "economy"
            else:
                response_col = "cylinders"
            print("Family = {0}".format(family))

            if family == 'binomial':
                training_data[response_col] = training_data[
                    response_col].asfactor()
            else:
                training_data[response_col] = training_data[
                    response_col].asnumeric()

            model = H2OGeneralizedLinearEstimator(family=family,
                                                  alpha=0,
                                                  Lambda=1e-5,
                                                  solver=solver)
            model.train(x=predictors,
                        y=response_col,
                        training_frame=training_data)
def test_HGLM_R():
    """Compare HGLM training metrics from H2O with stored reference values.

    A gaussian HGLM with a gaussian random effect on column index 0 is trained
    and six of its training metrics must match the reference numbers (labelled
    as coming from R in the failure message) to within 1e-6.
    """
    tot = 1e-6
    column_types = ["enum"] * 4 + ["numeric"] * 4
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/HGLM_5KRows_100Z.csv"),
        col_types=column_types)

    response = "response"
    fixed_effects = ["enum1", "enum2", "enum3", "num1", "num2", "num3"]
    random_column = 0

    hglm = H2OGeneralizedLinearEstimator(HGLM=True,
                                         family="gaussian",
                                         rand_family=["gaussian"],
                                         random_columns=[random_column],
                                         calc_like=True)
    hglm.train(x=fixed_effects, y=response, training_frame=h2o_data)
    h2o_metrics = hglm.training_model_metrics()

    # Reference values, listed in the order in which they are checked.
    expected = (
        ("hlik", -23643.3076231),
        ("caic", 47019.7968491),
        ("pvh", -23491.5738429),
        ("pbvh", -23490.2982034),
        ("dfrefe", 4953.0),
        ("varfix", 703.86912057),
    )
    for metric, r_value in expected:
        assert abs(r_value - h2o_metrics[metric]) < tot, \
            "for {2}, Expected from R: {0}, actual from H2O-3: {1}".format(
                r_value, h2o_metrics[metric], metric)
示例#22
0
    def predict_from_standalone_lr(self, train, test, valid, x, y,
                                   prediction_field_name):
        """Train a binomial GLM (logit link, L_BFGS solver) and score ``test``.

        Response levels and training/validation AUC are printed along the way,
        together with the first five prediction rows.

        :param train: the training H2O dataframe
        :param test: the testing H2O dataframe (the frame that gets scored)
        :param valid: the validation H2O dataframe
        :param x: the feature variables
        :param y: the target variable
        :param prediction_field_name: the name to use for field to contain predictions
        :returns: whatever ``self.set_prediction_field_name`` returns for the
            prediction frame and ``prediction_field_name``
        """
        print("Logistic Regression")
        glm_settings = dict(model_id='glm_v1',
                            family='binomial',
                            link='logit',
                            solver='L_BFGS')
        model = H2OGeneralizedLinearEstimator(**glm_settings)
        model.train(x=x, y=y, training_frame=train, validation_frame=valid)

        # Diagnostics about the response column and the fit quality.
        response_levels = train[y].levels()[0]
        print("train[y].levels():", response_levels)
        y_level_count = train[y].nlevels()[0]
        print("y_level_count:", y_level_count)
        print("AUC (training):", model.auc(train=True))
        print("AUC (validation):", model.auc(valid=True))

        predictions = model.predict(test)
        print("Logistic Regression Predictions:")
        print(predictions.head(rows=5))
        return self.set_prediction_field_name(predictions,
                                              prediction_field_name)
示例#23
0
def test_prostate():
    """Compare the null deviance of an H2O binomial GLM with statsmodels.

    The prostate CSV is loaded into H2O and, separately, into a NumPy array
    through pandas.  Column 1 is the response and columns 2 onward are the
    features for both libraries.  The two null deviances must agree to
    within 1e-5.
    """
    h2o_data = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same ndarray and is available in old and new pandas alike.
    sm_data = pd.read_csv(
        pyunit_utils.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                            nfolds=10,
                                            alpha=0.5)
    h2o_glm.train(x=list(range(2, h2o_data.ncol)),
                  y=1,
                  training_frame=h2o_data)
    sm_glm = sm.GLM(endog=sm_data_response,
                    exog=sm_data_features,
                    family=sm.families.Binomial()).fit()

    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()
               ) < 1e-5, "Expected null deviances to be the same"
示例#24
0
def pyunit_make_glm_model():
    """Train a gaussian GLM and post replacement betas to ``MakeGLMModel``.

    The coefficient names and values are sent as bracketed, comma-separated
    strings (names double-quoted), built from the ``new_betas`` mapping.
    """
    # TODO: PUBDEV-1717
    pros = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    predictor_names = ["AGE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    model = H2OGeneralizedLinearEstimator(family="gaussian", alpha=[0])
    model.train(x=predictor_names, y="CAPSULE", training_frame=pros)

    new_betas = {
        "AGE": 0.5,
        "DPROS": 0.5,
        "DCAPS": 0.5,
        "PSA": 0.5,
        "VOL": 0.5,
        "GLEASON": 0.5
    }

    # Keys and values are walked in the same dict order, so positions line up.
    names = "[" + ",".join("\"" + key + "\"" for key in new_betas) + "]"
    betas = "[" + ",".join(str(value) for value in new_betas.values()) + "]"

    res = h2o.H2OConnection.post_json("MakeGLMModel",
                                      model=model._id,
                                      names=names,
                                      beta=betas)
def save_load_model():
    """Save a trained binomial GLM to the results directory and load it back.

    Asserts that the results directory exists, that ``h2o.save_model``
    produced a file, and that the reloaded object is an ``H2OEstimator``.
    """
    prostate = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    glm.train(x=["AGE", "RACE", "PSA", "DCAPS"],
              y="CAPSULE",
              training_frame=prostate)

    # The save target has to be there before anything is written to it.
    save_dir = pyunit_utils.locate("results")
    dir_exists = os.path.isdir(save_dir)
    assert dir_exists, \
        "Expected save directory {0} to exist, but it does not.".format(
            save_dir)

    saved_file = h2o.save_model(glm, path=save_dir, force=True)
    file_exists = os.path.isfile(saved_file)
    assert file_exists, \
        "Expected load file {0} to exist, but it does not.".format(saved_file)

    reloaded = h2o.load_model(saved_file)
    assert isinstance(reloaded, H2OEstimator), \
        "Expected and H2OBinomialModel, but got {0}".format(reloaded)
示例#26
0
def std_coef_plot_test():
    """Call ``std_coef_plot`` on a GLM, with and without ``num_of_features``.

    The cars data is split roughly 80/20 by a uniform random column into
    training and validation frames, a default GLM is trained on them, and the
    standardized-coefficient plot is requested twice with ``server=True``.
    """
    kwargs = {'server': True}

    # import data set
    cars = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # Build train/validation sets from one uniform random draw (80/20)
    draw = cars[0].runif()
    train_part = cars[draw <= 0.8]
    valid_part = cars[draw > 0.8]

    # Features and target; the target is converted to a factor on `cars`.
    # NOTE(review): this conversion runs after the two splits were derived
    # from `cars` - confirm whether it is meant to affect them.
    feature_names = ["displacement", "power", "weight", "acceleration", "year"]
    target = "economy_20mpg"
    cars[target] = cars[target].asfactor()

    # Build and train a GLM model with default settings
    glm = H2OGeneralizedLinearEstimator()
    glm.train(x=feature_names,
              y=target,
              training_frame=train_part,
              validation_frame=valid_part)

    # Plot standardized coefficient magnitudes; the second call checks that
    # num_of_features is accepted.
    glm.std_coef_plot(server=True)
    glm.std_coef_plot(num_of_features=2, server=True)
示例#27
0
def h2oapi():
    """
    Python API test: h2o.api(endpoint, data=None, json=None, filename=None, save_to=None)

    Trains a small binomial GLM, then uses h2o.api() to fetch the frame
    summary and the GLM regularization path, checking the response type and
    a few expected counts.

    :raises AssertionError: if any step fails; the message carries the
        underlying error text.
    """
    try:
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
        Y = 3
        X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

        model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
        model.train(x=X, y=Y, training_frame=training_data)

        # test h2o.api() getting frame information
        frame_api = h2o.api("GET /3/Frames/%s/summary" % training_data.frame_id)
        assert_is_type(frame_api, H2OResponse)
        hf_col_summary = frame_api["frames"][0]  # reuse the response instead of a second request
        assert hf_col_summary["row_count"]==100, "row count is incorrect.  Fix h2o.api()."
        assert hf_col_summary["column_count"]==14, "column count is incorrect.  Fix h2o.api()."

        # test h2o.api() getting model information
        model_api = h2o.api("GET /3/GetGLMRegPath", data={"model": model._model_json["model_id"]["name"]})
        assert_is_type(model_api, H2OResponse)
        model_coefficients = model_api["coefficients"][0]
        assert len(model_coefficients)==11, "Number of coefficients is wrong.  h2o.api() command is not working."
    except Exception as e:
        # Still reported as an AssertionError, but the real cause is kept in
        # the message, and the failure is not lost when asserts are stripped
        # with -O (the previous `assert False` would then do nothing).
        raise AssertionError("h2o.api() command is not working: {0}".format(e))
示例#28
0
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs.

    The name node comes from the NAME_NODE environment variable and the model
    directory from MODEL_PATH.  A two-column slice of the test split is
    exported and re-imported, then a binomial GLM is trained, saved to hdfs,
    loaded back and used for prediction.
    '''
    name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    hdfs_root = "hdfs://" + name_node
    airlines = h2o.import_file(hdfs_root + "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for categorical in ("Month", "DayofMonth", "IsArrDelayed"):
        airlines[categorical] = airlines[categorical].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = airlines.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    export_target = hdfs_root + "/datasets/exported.csv"
    h2o.export_file(test[:, ["Year", "DayOfWeek"]], export_target)

    print("Reading file back in and comparing if data is the same")
    reimported = h2o.import_file(hdfs_root + "/datasets/exported.csv")
    day_of_week_diff = test[:, "DayOfWeek"] - reimported[:, "DayOfWeek"]
    assert(day_of_week_diff.sum() == 0)

    print("Training")
    glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    # Only a few features are used; training on all of them is not needed here.
    glm.train(x=myX, y="IsArrDelayed", training_frame=train)

    model_dir = os.getenv("MODEL_PATH")
    print("Saving model")
    saved_path = h2o.save_model(glm, hdfs_root + "/" + model_dir)
    print("Loading back model")
    restored = h2o.load_model(saved_path)
    print("Running predictions")
    preds = restored.predict(test)
示例#29
0
def _init_model(args):
    """Build an untrained binomial GLM from ``args.n_folds`` and ``args.random_seed``.

    Lambda search is switched off.  The estimator class is imported inside
    the function.
    """
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator

    settings = {
        "nfolds": args.n_folds,
        "family": "binomial",
        "lambda_search": False,
        "seed": args.random_seed,
    }
    return H2OGeneralizedLinearEstimator(**settings)
示例#30
0
def h2odownload_pojo():
    """
    Python API test: h2o.download_pojo(model, path=u'', get_jar=True)

    Copied from glm_download_pojo.py

    Trains a binomial GLM on the prostate data and downloads its POJO into
    the "results" directory, checking that h2o-genmodel.jar shows up there.
    If that attempt raises, the POJO is downloaded without a path instead.

    :raises AssertionError: if training or the fallback download fails; the
        message carries the underlying error text.
    """
    try:
        h2o_df = h2o.import_file(
            pyunit_utils.locate("smalldata/prostate/prostate.csv"))
        h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
        binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
        binomial_fit.train(y="CAPSULE",
                           x=["AGE", "RACE", "PSA", "GLEASON"],
                           training_frame=h2o_df)
        try:
            results_dir = pyunit_utils.locate(
                "results")  # find directory path to results folder
            h2o.download_pojo(binomial_fit, path=results_dir)
            assert os.path.isfile(os.path.join(results_dir, "h2o-genmodel.jar")), "h2o.download_pojo() " \
                                                                                  "command is not working."
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit are
            # no longer swallowed.
            # NOTE(review): a failed jar assertion above is caught here too
            # and only triggers this fallback - confirm that is intended.
            h2o.download_pojo(
                binomial_fit
            )  # just print pojo to screen if directory does not exists
    except Exception as e:
        # Still reported as an AssertionError, but with the real cause in the
        # message, and not silently dropped when asserts are stripped with -O.
        raise AssertionError(
            "h2o.download_pojo() command is not working: {0}".format(e))