# Example #1
# 0
def glm_alpha_lambda_arrays_cv():
    """Compare GLM regularization paths built with and without a validation frame.

    Two gaussian GLMs are trained with the same alpha array, lambda array and
    3-fold cross-validation; only the first one is given a validation frame.
    Every submodel on the two regularization paths must have the same raw and
    standardized coefficients (tolerance 1e-6).
    """
    # The models built below use family="gaussian" on the gaussian dataset.
    print("Testing glm cross-validation with alpha array, lambda array for gaussian models.")
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    # NOTE(review): each column is assigned back to itself unchanged; the list
    # name suggests a conversion such as .asfactor() was intended -- confirm.
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    # list.remove() mutates in place and returns None, so the predictor list
    # has to be built explicitly.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # choices made in model_all and model_xval should be the same since they should be using xval metrics
    model_all = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9], nfolds=3, cold_start=True)
    model_all.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model_all_rpath = glm.getGLMRegularizationPath(model_all)
    model_xval = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9], nfolds=3, cold_start=True)
    model_xval.train(x=myX, y=myY, training_frame=training_data)
    model_xval_rpath = glm.getGLMRegularizationPath(model_xval)

    for index in range(len(model_all_rpath['lambdas'])):
        print("comparing coefficients for submodel {0}".format(index))
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients'][index],
                                           model_xval_rpath['coefficients'][index], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients_std'][index],
                                           model_xval_rpath['coefficients_std'][index], tol=1e-6)
def reg_path_glm():
    """Validate the regularization path of a binomial GLM built with lambda search.

    First, a model rebuilt from the coefficients of submodel 10 must reproduce
    the explained training deviance recorded on the path.  Then, for every
    lambda on the path, a GLM trained with that single lambda must match the
    path's raw and standardized coefficients (max abs difference < 1e-2) and
    its explained training deviance (difference < 1e-4).
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', lambda_search=True, solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for index in range(len(r['lambdas'])):
        # Separate name so the lambda-search model above is not shadowed.
        single = glm(family='binomial', lambda_search=False, Lambda=r['lambdas'][index],
                     solver='COORDINATE_DESCENT')
        single.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        cs = r['coefficients'][index]
        cs_norm = r['coefficients_std'][index]
        # Fetch the coefficient dictionaries once instead of once per key.
        single_coefs = single.coef()
        single_coefs_norm = single.coef_norm()
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff, abs(cs[n] - single_coefs[n]))
            diff2 = max(diff2, abs(cs_norm[n] - single_coefs_norm[n]))
        print(diff)
        print(diff2)
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = single.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][index]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4
# Example #3
# 0
def testGLMGaussianScoringHistory():
    """Check that gaussian GLM scoring histories agree across scoring settings.

    Models built with ``score_each_iteration=True`` and with
    ``score_iteration_interval=1`` are compared on the columns listed in
    ``col_list_compare``, first without and then with 3-fold cross-validation.
    A final cross-validated model with ``score_iteration_interval=4`` is
    compared against the ``score_each_iteration`` model through
    ``assertEqualScoringHistoryIteration``.
    """
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_rmse", "validation_rmse",
                        "training_mae", "validation_mae", "training_deviance", "validation_deviance"]

    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    # NOTE(review): each column is assigned back to itself unchanged; the list
    # name suggests a conversion such as .asfactor() was intended -- confirm.
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    # list.remove() mutates in place and returns None, so the predictor list
    # has to be built explicitly.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build gaussian model with score_each_iteration set to true
    model = glm(family="gaussian", score_each_iteration=True)
    model.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1
    model_score_each = glm(family="gaussian", score_iteration_interval=1)
    model_score_each.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each, col_list_compare)

    # build gaussian model with score_each_iteration set to true, with CV
    model_cv = glm(family="gaussian", score_each_iteration=True, nfolds=3, fold_assignment='modulo', seed=1234)
    model_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1, with CV
    model_score_each_cv = glm(family="gaussian", score_iteration_interval=1, nfolds=3, fold_assignment='modulo',
                              seed=1234)
    model_score_each_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv, col_list_compare)

    # build gaussian model scoring only every fourth iteration, with CV
    model_cv_4th = glm(family="gaussian", score_iteration_interval=4, nfolds=3, fold_assignment='modulo', seed=1234)
    model_cv_4th.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assertEqualScoringHistoryIteration(model_cv_4th, model_cv, col_list_compare)
def glm_alpha_array_with_lambda_search_cv():
    """Compare multinomial GLM regularization paths with and without a validation frame.

    Two multinomial GLMs are trained with the same alpha array, lambda search
    and 3-fold cross-validation; only the second one is given a validation
    frame.  Every submodel on the two regularization paths must have the same
    raw and standardized coefficients (tolerance 1e-6).
    """
    # read in the dataset and construct training set (and validation set)
    print("Testing glm cross-validation with alpha array, lambda_search for multiomial models.")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5"]
    # NOTE(review): each column is assigned back to itself unchanged; the list
    # name suggests a conversion such as .asfactor() was intended -- confirm.
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    # list.remove() mutates in place and returns None, so the predictor list
    # has to be built explicitly.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build model with CV but no validation dataset
    cv_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for index in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(index))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][index],
                                           cv_r_valid['coefficients'][index], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][index],
                                           cv_r_valid['coefficients_std'][index], tol=1e-6)
def glm_alpha_lambda_arrays():
    """Validate a binomial GLM regularization path built with a validation frame.

    A lambda-search model with an alpha array is trained with a validation
    frame.  The coefficients of its best submodel, loaded into a new model,
    must reproduce the explained validation deviance recorded on the path.
    Then every (alpha, lambda) pair on the path is retrained on its own and
    compared against the corresponding submodel: coefficients, regularization
    path entries and, for the best submodel, the validation metrics.  All other
    submodels must not explain more validation deviance than the best one.
    """
    # compare coefficients and deviance when only training dataset is available
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        train[ind] = train[ind].asfactor()
    train["C21"] = train["C21"].asfactor()
    frames = train.split_frame(ratios=[0.8], seed=12345)
    d = frames[0]
    d_test = frames[1]
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]

    # compare results when validation dataset is present
    mLVal = glm(family='binomial',
                alpha=[0.1, 0.5],
                lambda_search=True,
                solver='COORDINATE_DESCENT',
                nlambdas=3)  # train with validations set
    mLVal.train(training_frame=d,
                x=list(range(20)),
                y=20,
                validation_frame=d_test)
    rVal = glm.getGLMRegularizationPath(mLVal)
    best_submodel_indexVal = mLVal._model_json["output"]["best_submodel_index"]
    m2Val = glm.makeGLMModel(
        model=mLVal, coefs=rVal['coefficients'][best_submodel_indexVal])
    dev1Val = rVal['explained_deviance_valid'][best_submodel_indexVal]
    p2Val = m2Val.model_performance(d_test)
    dev2Val = 1 - p2Val.residual_deviance() / p2Val.null_deviance()
    assert abs(dev1Val - dev2Val) < 1e-6
    for index in range(len(rVal['lambdas'])):
        m = glm(family='binomial',
                alpha=[rVal['alphas'][index]],
                Lambda=rVal['lambdas'][index],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d,
                x=list(range(20)),
                y=20,
                validation_frame=d_test)
        mr = glm.getGLMRegularizationPath(m)
        p = m.model_performance(d_test)
        cs = rVal['coefficients'][index]
        cs_norm = rVal['coefficients_std'][index]
        print("Comparing submodel index {0}".format(index))
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-1)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-1)
        pyunit_utils.assertEqualRegPaths(regKeys, rVal, index, mr, tol=1e-3)
        dVal = 1 - p.residual_deviance() / p.null_deviance()
        if index == best_submodel_indexVal:
            # validation metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["validation_metrics"],
                mLVal._model_json["output"]["validation_metrics"],
                tol=1e-2)
        else:
            # any other submodel must not explain more validation deviance
            # than the best submodel; the message now reports the index
            assert dVal <= dev2Val, \
                "Best submodel does not have highest explained deviance_valid for submodel: {0}!".format(index)
def glm_alpha_array_lambda_null():
    """Check a multinomial lambda-search path against single-submodel refits.

    A multinomial GLM is trained on covtype with an alpha array, lambda search,
    cold start and five lambdas.  For each entry on its regularization path a
    fresh model is trained with that alpha/lambda pair and compared with the
    path: per-class coefficients, explained training deviance (within 1e-6)
    and regularization-path fields.  The best submodel must also match the
    training metrics of the original model; every other submodel must not
    exceed the best submodel's explained training deviance.
    """
    # first test: compare coefficients and deviance
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    full_model = glm(family='multinomial',
                     alpha=[0.1, 0.5, 0.9],
                     lambda_search=True,
                     solver='COORDINATE_DESCENT',
                     cold_start=True,
                     nlambdas=5)
    frame[54] = frame[54].asfactor()
    full_model.train(training_frame=frame, x=list(range(0, 54)), y=54)
    path = glm.getGLMRegularizationPath(full_model)

    reg_keys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_index = full_model._model_json["output"]["best_submodel_index"]
    # Column names for the eight per-class coefficient groups.
    class_cols = ['coefs_class_{0}'.format(k) for k in range(8)]
    class_cols_norm = ['std_coefs_class_{0}'.format(k) for k in range(8)]

    for idx in range(len(path['lambdas'])):
        alpha_value = path['alphas'][idx]
        lambda_value = path['lambdas'][idx]
        print("compare models for index {0}, alpha {1}, lambda{2}".format(idx, alpha_value, lambda_value))

        refit = glm(family='multinomial',
                    alpha=[alpha_value],
                    Lambda=[lambda_value],
                    solver='COORDINATE_DESCENT')
        refit.train(training_frame=frame, x=list(range(0, 54)), y=54)
        refit_path = glm.getGLMRegularizationPath(refit)

        pyunit_utils.assertCoefEqual(path['coefficients'][idx], refit.coef(), class_cols)
        pyunit_utils.assertCoefEqual(path['coefficients_std'][idx], refit.coef_norm(), class_cols_norm)

        explained_refit = 1 - refit.residual_deviance() / refit.null_deviance()
        explained_path = path['explained_deviance_train'][idx]
        assert abs(explained_refit - explained_path) < 1e-6
        pyunit_utils.assertEqualRegPaths(reg_keys, path, idx, refit_path)

        if idx != best_index:
            # no other submodel may beat the best submodel's explained deviance
            assert explained_refit <= path['explained_deviance_train'][best_index], \
                "Best submodel does not best explained_deviance_train!"
        else:
            # training metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(
                refit._model_json["output"]["training_metrics"],
                full_model._model_json["output"]["training_metrics"],
                keySet=["MSE", "null_deviance", "logloss", "RMSE", "r2"],
                tol=5e-1)
def glm_alpha_lambda_arrays():
    """Validate a binomial GLM regularization path built from alpha and lambda arrays.

    A model is trained on the prostate data with explicit lambda and alpha
    arrays.  The coefficients of its best submodel, loaded into a new model,
    must reproduce the explained training deviance recorded on the path.
    Then every (alpha, lambda) pair on the path is retrained on its own and
    compared with the corresponding submodel: coefficients (max abs difference
    < 1e-2), explained training deviance (< 1e-4), regularization path entries
    and, for the best submodel, the training metrics.  All other submodels must
    have a residual deviance no lower than the best one.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial',
             Lambda=[0.9, 0.5, 0.1],
             alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for index in range(len(r['lambdas'])):
        m = glm(family='binomial',
                alpha=[r['alphas'][index]],
                Lambda=[r['lambdas'][index]],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][index]
        cs_norm = r['coefficients_std'][index]
        # Fetch the coefficient dictionaries once instead of once per key.
        m_coefs = m.coef()
        m_coefs_norm = m.coef_norm()
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff, abs(cs[n] - m_coefs[n]))
            diff2 = max(diff2, abs(cs_norm[n] - m_coefs_norm[n]))
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][index]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, index, mr, tol=1e-5)
        if index == best_submodel_index:
            # training metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-5)
        else:
            # any other submodel should have a residual deviance no lower than the best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowerest " \
                                                                    "residual_deviance()!"
def glm_alpha_array_lambda_null():
    """Check a multinomial alpha-array path (no lambda given) against refits.

    A multinomial GLM is trained on covtype with only an alpha array.  For each
    entry on its regularization path a fresh model is trained with that
    alpha/lambda pair and compared with the path: per-class coefficients
    (tolerance 1e-5), explained training deviance (within 1e-4) and
    regularization-path fields.  The best submodel must also match the
    training metrics of the original model; every other submodel must have a
    logloss at least as large as the original model's.
    """
    # first test: compare coefficients and deviance
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    full_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9])
    frame[54] = frame[54].asfactor()
    full_model.train(training_frame=frame, x=list(range(0, 54)), y=54)
    path = glm.getGLMRegularizationPath(full_model)

    reg_keys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_index = full_model._model_json["output"]["best_submodel_index"]
    # Column names for the eight per-class coefficient groups.
    class_cols = ['coefs_class_{0}'.format(k) for k in range(8)]
    class_cols_norm = ['std_coefs_class_{0}'.format(k) for k in range(8)]

    for idx in range(len(path['lambdas'])):
        refit = glm(family='multinomial',
                    alpha=[path['alphas'][idx]],
                    Lambda=[path['lambdas'][idx]])
        refit.train(training_frame=frame, x=list(range(0, 54)), y=54)
        refit_path = glm.getGLMRegularizationPath(refit)

        pyunit_utils.assertCoefEqual(path['coefficients'][idx], refit.coef(), class_cols, tol=1e-5)
        pyunit_utils.assertCoefEqual(path['coefficients_std'][idx], refit.coef_norm(), class_cols_norm, tol=1e-5)

        explained_refit = 1 - refit.residual_deviance() / refit.null_deviance()
        explained_path = path['explained_deviance_train'][idx]
        assert abs(explained_refit - explained_path) < 1e-4
        pyunit_utils.assertEqualRegPaths(reg_keys, path, idx, refit_path)

        if idx != best_index:
            # every other submodel should have a logloss no lower than the original model
            assert refit.logloss() >= full_model.logloss(), "Best submodel does not have lowerest logloss()!"
        else:
            # training metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(
                refit._model_json["output"]["training_metrics"],
                full_model._model_json["output"]["training_metrics"],
                tol=1e-2)
def test_makeGLMModel():
    """Check ``glm.makeGLMModel`` for a gaussian model on the prostate data.

    A model rebuilt from the first set of regularization-path coefficients must
    give the same predictions as the trained model.  Passing a coefficient
    dictionary with one extra entry must raise an exception whose text reports
    the coefficient-length mismatch.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8  # extra coefficient the model does not have

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Raised outside the try block so the handler above cannot swallow it
        # and replace it with the misleading "Wrong exception" message.
        raise AssertionError("Should have throw exception of bad coefficient length")
# Example #10
# 0
def test_glm_multinomial_coeffs():
    """Check the two multinomial coefficient tables of a GLM trained on iris.

    The table keyed by class index and the table keyed by class name must each
    contain only the expected column headers, and both tables must hold the
    same values.
    """
    trainF = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    y = "species"
    x = [0, 1, 2, 3]
    bin_LS = glm(family='multinomial', seed=12345)
    bin_LS.train(x=x, y=y, training_frame=trainF)
    print(bin_LS.summary())
    coefficient_table_original = bin_LS._model_json["output"]["coefficients_table"]
    coefficient_table = bin_LS._model_json["output"]["coefficients_table_multinomials_with_class_names"]

    coeffNamesOld = coefficient_table_original.col_header
    coeffNames = coefficient_table.col_header
    validCoefficientNames = [u"names", u"coefs_class_Iris-setosa", u"coefs_class_Iris-versicolor",
                             u"coefs_class_Iris-virginica", u"std_coefs_class_Iris-setosa",
                             u"std_coefs_class_Iris-versicolor", u"std_coefs_class_Iris-virginica"]
    oldCoefficientNames = [u"names", u"coefs_class_0", u"coefs_class_1",
                           u"coefs_class_2", u"std_coefs_class_0",
                           u"std_coefs_class_1", u"std_coefs_class_2"]
    print(coefficient_table)
    print(coefficient_table_original)

    # compare coefficient names: every header must be one of the expected names
    assert len(set(coeffNames).intersection(validCoefficientNames)) == len(coeffNames), \
        "Expected coefficient names: {0}.  Actual coefficient names: {1}".format(validCoefficientNames, coeffNames)
    # the original table is checked against its own header count, not the new table's
    assert len(set(coeffNamesOld).intersection(oldCoefficientNames)) == len(coeffNamesOld), \
        "Expected original coefficient names: {0}.  Actual original coefficient names: " \
        "{1}".format(oldCoefficientNames, coeffNamesOld)

    # compare table contents to make sure they contain the same values
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(coefficient_table_original, coefficient_table, [u'coefs_class_0'],
                                                  tolerance=1e-10)
def test_glm_multinomial_makeGLMModel():
    """Check ``glm.makeGLMModel`` for a multinomial model on the covtype data.

    The count returned by ``check_nonzero_coefs`` for the first set of
    regularization-path coefficients must equal the model's reported rank.  A
    model rebuilt from those coefficients must give the same predictions as
    the trained model.  Passing a coefficient dictionary with one extra entry
    must raise an exception whose text reports the coefficient-length mismatch.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    rank = check_nonzero_coefs(r['coefficients'][0])
    assert rank == mL._model_json["output"]["rank"], "expected rank: {0}, actual rank: {1}." \
                                                     "".format(rank, mL._model_json["output"]["rank"])
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123  # add extra coefficients to model coefficient

    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
           ("model coefficient length 371 is different from coefficient provided by user") in temp, \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")
    else:
        # Raised outside the try block so the handler above cannot swallow it
        # and replace it with the misleading "Wrong exception" message.
        raise AssertionError("Should have thrown an exception!")
def testOrdinalLogit():
    """Check ``glm.makeGLMModel`` for an ordinal GLM.

    A model rebuilt from the first set of regularization-path coefficients must
    give the same predictions as the trained model.  Passing a coefficient
    dictionary with one extra entry must raise an exception whose text reports
    the coefficient-length mismatch.
    """
    Dtrain = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"
        ))
    Dtrain["C21"] = Dtrain["C21"].asfactor()

    print("Fit model on dataset")
    model = glm(family="ordinal",
                alpha=[0.5],
                lambda_=[0.001],
                max_iterations=1000,
                beta_epsilon=1e-8,
                objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415  # extra coefficient the model does not have

    try:
        glm.makeGLMModel(model=model, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Raised outside the try block so the handler above cannot swallow it
        # and replace it with the misleading "Wrong exception" message.
        raise AssertionError("Should have thrown an exception!")
def test_makeGLMModel():
    """Check ``glm.makeGLMModel`` for a binomial model on the prostate data.

    A model rebuilt from the first set of regularization-path coefficients must
    give the same values in prediction column 1 as the trained model.  Passing
    a coefficient dictionary with one extra entry must raise an exception whose
    text reports the coefficient-length mismatch.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial',
            Lambda=[0.001],
            alpha=[0.5],
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8  # extra coefficient the model does not have

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user ") in temp,\
            "Wrong exception was received."
        print("makeGLMModel test passed!")
    else:
        # Raised outside the try block so the handler above cannot swallow it
        # and replace it with the misleading "Wrong exception" message.
        raise AssertionError("Test failed: should have throw exception of bad coefficient length!")
# Example #14
# 0
def test_glm_scoring_history_TomasF():
    """Train a GLM with ``generate_scoring_history=True`` and fetch its scoring history.

    The test makes no assertions; it passes as long as training on the
    prostate data and the ``scoring_history()`` call complete without raising.
    """
    data_path = pyunit_utils.locate("smalldata/prostate/prostate.csv")
    frame = h2o.import_file(data_path)
    # The response column is converted to a categorical before training.
    capsule_factor = frame["CAPSULE"].asfactor()
    frame["CAPSULE"] = capsule_factor

    estimator = glm(generate_scoring_history=True)
    estimator.train(y="CAPSULE", training_frame=frame)
    estimator.scoring_history()
def testGLMBinomialScoringHistory():
    """Check that binomial GLM scoring histories agree across scoring settings.

    Models built with ``score_iteration_interval=1`` and with
    ``score_each_iteration=True`` are compared on the columns listed in
    ``col_list_compare``, first without and then with 3-fold cross-validation.
    A final cross-validated model with ``score_iteration_interval=4`` is
    compared against the interval-1 model through
    ``assertEqualScoringHistoryIteration``.
    """
    col_list_compare = [
        "iterations", "objective", "negative_log_likelihood",
        "training_logloss", "validation_logloss",
        "training_classification_error", "validation_classification_error",
        "training_rmse", "validation_rmse",
        "training_auc", "validation_auc",
        "training_pr_auc", "validation_pr_auc",
        "training_lift", "validation_lift",
    ]
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for col_index in range(10):
        h2o_data[col_index] = h2o_data[col_index].asfactor()
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    train, valid = h2o_data.split_frame(ratios=[.8], seed=1234)
    response = "C21"
    predictors = list(range(20))

    def build(**params):
        # Build a binomial GLM with the given extra parameters and train it on
        # the shared training/validation split.
        estimator = glm(family="binomial", **params)
        estimator.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
        return estimator

    cv_params = dict(nfolds=3, fold_assignment='modulo', seed=1234)

    print("Building model with score_interval=1.  Should generate same model as "
          "score_each_iteration turned on.")
    interval_one = build(score_iteration_interval=1)
    print("Building model with score_each_iteration turned on.")
    each_iteration = build(score_each_iteration=True)
    # both models should produce the same scoring history
    pyunit_utils.assert_equal_scoring_history(each_iteration, interval_one, col_list_compare)

    print("Building model with score_each_iteration turned on, with  CV.")
    each_iteration_cv = build(score_each_iteration=True, **cv_params)
    print("Building model with score_interval=1, and CV.  Should generate same model as score_each_iteration turned "
          "on, with lambda search and CV.")
    interval_one_cv = build(score_iteration_interval=1, **cv_params)
    # both cross-validated models should produce the same scoring history
    pyunit_utils.assert_equal_scoring_history(each_iteration_cv, interval_one_cv, col_list_compare)

    # with an interval of 4 the output should be the same for every fourth iteration
    interval_four_cv = build(score_iteration_interval=4, **cv_params)
    pyunit_utils.assertEqualScoringHistoryIteration(interval_one_cv, interval_four_cv, col_list_compare)
def set_glm_startvals():
    """Exercise the GLM ``startval`` parameter on the prostate data.

    Three cases are covered:

    * ``startval1`` (zeros plus the log-odds of the response mean as intercept)
      must give the same coefficients as a model trained without ``startval``.
    * ``startval2`` (the coefficients of the already trained model) is expected
      to give coefficients that differ from the reference by more than 1e-6.
    * ``startvalBad`` (wrong length) is expected to make model building fail.

    The two negative cases raise ``AssertionError`` when the expected failure
    does not happen.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    mLcoeff = mL.coef()
    r = glm.getGLMRegularizationPath(mL)
    rcoeff = r["coefficients"][0]
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean/(1.0-responseMean))
    startval1 = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startval2 = [rcoeff["AGE"], rcoeff["RACE"], rcoeff["DPROS"], rcoeff["DCAPS"], rcoeff["PSA"], rcoeff["VOL"],
                 rcoeff["GLEASON"], rcoeff["Intercept"]]
    startvalBad = [0, 0]

    ml1 = glm(family="binomial", startval=startval1)  # same starting condition as GLM
    ml1.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml1Coeff = ml1.coef()
    pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml1Coeff, tol=1e-6)  # coeffs should be the same

    ml2 = glm(family="binomial", startval=startval2)  # different starting condition from GLM
    ml2.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml2Coeff = ml2.coef()

    # The "should have thrown" failures are raised from the else branches:
    # inside the try block they would be caught by `except Exception` and
    # merely printed, so the checks could never fail.
    try:
        pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml2Coeff, tol=1e-6)
    except Exception as ex:
        print(ex)
    else:
        raise AssertionError("Should have thrown an error as coefficients are different!")

    try:
        mlbad = glm(family="binomial", startval=startvalBad)
        mlbad.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    except Exception as ex:
        print(ex)
        print("Test completed!  Success!")
    else:
        raise AssertionError("Should have thrown an error with bad GLM initial values!")
def binomial_plot_test():
    """Check that GLM plots can be saved through ``save_plot_path``.

    A binomial GLM is trained on the benign data.  For both the scoring
    history plot and the permutation importance plot, one result produced
    without ``save_plot_path`` and one produced with it are handed to
    ``test_plot_result_saving`` together with two paths inside a temporary
    directory.
    """
    benign = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))
    predictor_columns = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]
    response_column = 3
    model = glm(family="binomial")
    model.train(x=predictor_columns, y=response_column, training_frame=benign)

    # test saving:
    with TemporaryDirectory() as tmpdir:
        saved_path = "{}/plot1.png".format(tmpdir)
        other_path = "{}/plot2.png".format(tmpdir)

        # scoring history plot: first without, then with save_plot_path
        history_plain = model.plot(timestep="AUTO", metric="objective", server=True)
        history_saved = model.plot(timestep="AUTO",
                                   metric="objective",
                                   server=True,
                                   save_plot_path=saved_path)
        test_plot_result_saving(history_plain, other_path, history_saved, saved_path)

        # permutation importance plot: first without, then with save_plot_path
        importance_plain = model.permutation_importance_plot(benign)
        importance_saved = model.permutation_importance_plot(benign, save_plot_path=saved_path)
        test_plot_result_saving(importance_plain, other_path, importance_saved, saved_path)
# Example #18
# 0
def test_modelselection_gaussian():
    """Check ``modelSelection`` results against exhaustively trained GLMs.

    ``modelSelection`` is run on the prostate data in "maxr" and "allsubsets"
    mode with at most three predictors.  The best R2 values it reports for one
    predictor (index 0) and three predictors (index 2) must match, within
    1e-6, the best R2 found by training a GLM on every single predictor and on
    every three-predictor combination, and the two modes must agree with each
    other.
    """
    # Local import keeps the block self-contained.
    from itertools import combinations

    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model_maxr = modelSelection(seed=12345,
                                max_predictor_number=3,
                                mode="maxr")
    model_maxr.train(training_frame=d, x=my_x, y=my_y)
    model_allsubsets = modelSelection(seed=12345,
                                      max_predictor_number=3,
                                      mode="allsubsets")
    model_allsubsets.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_allsubsets = model_allsubsets.get_best_R2_values()
    best_predictor_names_allsubsets = model_allsubsets.get_best_model_predictors()
    best_r2_value_maxr = model_maxr.get_best_R2_values()

    def _train_r2(predictors):
        # Train a GLM on the given predictor list and return its R2.
        m = glm(seed=12345)
        m.train(training_frame=d, x=predictors, y=my_y)
        return m.r2()

    # assert that model returned with one predictor found by modelselection is the best by comparing it to manual
    # training result
    one_pred_r2 = []
    for pred in my_x:
        one_pred_r2.append(_train_r2([pred]))
    best_r2 = max(one_pred_r2)
    assert abs(best_r2-best_r2_value_allsubsets[0]) < 1e-6, "expected best r2: {0}, allsubset: actual best r2:{1}. " \
                                                            " They are different.".format(best_r2, best_r2_value_allsubsets[0])
    assert abs(best_r2-best_r2_value_maxr[0]) < 1e-6, "expected best r2: {0}, maxr: actual best r2:{1}. " \
                                                      " They are different.".format(best_r2, best_r2_value_maxr[0])
    assert abs(best_r2_value_allsubsets[0]-best_r2_value_maxr[0]) < 1e-6, "allsubset best r2: {0}, maxr best r2:{1}. " \
                                                                          " They are different." \
                                                                          "".format(best_r2_value_allsubsets[0],
                                                                                    best_r2_value_maxr[0])

    print("Best one predictor model uses predictor: {0}".format(
        best_predictor_names_allsubsets[0]))

    # Enumerate every three-predictor subset.  A hand-written list is easy to
    # get wrong (duplicates, omissions), which could hide the true best subset.
    three_pred_r2 = []
    for subset in combinations(my_x, 3):
        three_pred_r2.append(_train_r2(list(subset)))
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred-best_r2_value_allsubsets[2]) < 1e-6, "expected best r2: {0}, allsubsets: actual best " \
                                                                       "r2:{1}.  They are different." \
                                                                       "".format(best_r2_three_pred,
                                                                                 best_r2_value_allsubsets[2])
    assert abs(best_r2_three_pred-best_r2_value_maxr[2]) < 1e-6, "expected best r2: {0}, maxr: actual best " \
                                                                 "r2:{1}.  They are different." \
                                                                 "".format(best_r2_three_pred, best_r2_value_maxr[2])
    assert abs(best_r2_value_allsubsets[2]-best_r2_value_maxr[2]) < 1e-6, "allsubset best r2: {0}, maxr: actual best " \
                                                                          "r2:{1}.  They are different." \
                                                                          "".format(best_r2_value_allsubsets[2],
                                                                                    best_r2_value_maxr[2])
    print("Best three predictors model uses predictors: {0}".format(
        best_predictor_names_allsubsets[2]))