# Example no. 1 (scrape artifact: example header, score 0)
def testGLMGaussianScoringHistory():
    """Verify GLM gaussian scoring history consistency.

    Checks that a model built with score_each_iteration=True produces the same
    scoring history as one built with score_iteration_interval=1, both with and
    without cross-validation, and that score_iteration_interval=4 scores at the
    matching iterations of the interval-1 CV model.
    """
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_rmse", "validation_rmse",
                        "training_mae", "validation_mae", "training_deviance", "validation_deviance"]

    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # BUG FIX: original was a no-op self-assignment; convert to categorical as intended
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    # BUG FIX: list.remove() returns None, so myX used to be None; build the predictor list explicitly
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build gaussian model with score_each_interval to true
    model = glm(family="gaussian", score_each_iteration=True)
    model.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval to 1
    model_score_each = glm(family="gaussian", score_iteration_interval=1)
    model_score_each.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each, col_list_compare)

    # build gaussian model with score_each_interval to true, with CV
    model_cv = glm(family="gaussian", score_each_iteration=True, nfolds=3, fold_assignment='modulo', seed=1234)
    model_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval to 1, with CV
    model_score_each_cv = glm(family="gaussian", score_iteration_interval=1, nfolds=3, fold_assignment='modulo', seed=1234)
    model_score_each_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv, col_list_compare)
    model_cv_4th = glm(family="gaussian", score_iteration_interval=4, nfolds=3, fold_assignment='modulo', seed=1234)
    model_cv_4th.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # FIX: interval-1 model first, interval-4 model second, matching every other call site in this file
    pyunit_utils.assertEqualScoringHistoryIteration(model_cv, model_cv_4th, col_list_compare)
def testGLMBinomialScoringHistory():
    """Verify GLM binomial scoring history consistency.

    A model with score_iteration_interval=1 must produce the same scoring
    history as one with score_each_iteration=True, with and without CV; a
    model with score_iteration_interval=4 must agree at every 4th iteration.
    """
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_logloss",
                        "validation_logloss", "training_classification_error",
                        "validation_classification_error", "training_rmse", "validation_rmse",
                        "training_auc", "validation_auc", "training_pr_auc", "validation_pr_auc",
                        "training_lift", "validation_lift"]
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    # first ten predictors and the response are categorical
    for col_idx in range(10):
        frame[col_idx] = frame[col_idx].asfactor()
    frame["C21"] = frame["C21"].asfactor()
    part_train, part_valid = frame.split_frame(ratios=[.8], seed=1234)
    response = "C21"
    predictors = list(range(0, 20))

    print("Building model with score_interval=1.  Should generate same model as "
          "score_each_iteration turned on.")
    model_interval_1 = glm(family="binomial", score_iteration_interval=1)
    model_interval_1.train(x=predictors, y=response, training_frame=part_train, validation_frame=part_valid)
    print("Building model with score_each_iteration turned on.")
    model_each_iter = glm(family="binomial", score_each_iteration=True)
    model_each_iter.train(x=predictors, y=response, training_frame=part_train, validation_frame=part_valid)
    # the two scoring histories must be identical
    pyunit_utils.assert_equal_scoring_history(model_each_iter, model_interval_1, col_list_compare)

    print("Building model with score_each_iteration turned on, with  CV.")
    model_each_iter_cv = glm(family="binomial", score_each_iteration=True, nfolds=3,
                             fold_assignment='modulo', seed=1234)
    model_each_iter_cv.train(x=predictors, y=response, training_frame=part_train, validation_frame=part_valid)
    print("Building model with score_interval=1, and CV.  Should generate same model as score_each_iteration turned "
          "on, with lambda search and CV.")
    model_interval_1_cv = glm(family="binomial", score_iteration_interval=1, nfolds=3,
                              fold_assignment='modulo', seed=1234)
    model_interval_1_cv.train(x=predictors, y=response, training_frame=part_train, validation_frame=part_valid)
    # the two CV scoring histories must also be identical
    pyunit_utils.assert_equal_scoring_history(model_each_iter_cv, model_interval_1_cv, col_list_compare)

    # with scoring_interval set to 4, the output should match at every fourth iteration
    model_interval_4_cv = glm(family="binomial", score_iteration_interval=4, nfolds=3,
                              fold_assignment='modulo', seed=1234)
    model_interval_4_cv.train(x=predictors, y=response, training_frame=part_train, validation_frame=part_valid)
    pyunit_utils.assertEqualScoringHistoryIteration(model_interval_1_cv, model_interval_4_cv, col_list_compare)
# Example no. 3 (scrape artifact: example header, score 0)
def test_glm_scoring_history_multinomial():
    """Verify GLM multinomial scoring history consistency under lambda search.

    score_iteration_interval=1 must reproduce the score_each_iteration=True
    history (with and without CV); score_iteration_interval=4 must agree at
    the matching iterations of the interval-1 CV model.
    """
    col_list_compare = ["iterations", "training_logloss", "validation_logloss",
                        "training_classification_error", "validation_classification_error",
                        "deviance_train", "deviance_test"]
    print("Preparing dataset....")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    # categorical predictors plus the categorical response
    for col in ("C1", "C2", "C3", "C4", "C5", "C11"):
        h2o_data[col] = h2o_data[col].asfactor()
    train, valid = h2o_data.split_frame(ratios=[.8], seed=1234)
    predictors = list(range(10))
    response = "C11"

    print("Building model with score_each_iteration turned on, with lambda search.")
    model_each = glm(family="multinomial", score_each_iteration=True, lambda_search=True,
                     nlambdas=10, generate_scoring_history=True)
    model_each.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_interval=1.  Should generate same model as score_each_iteration turned on.")
    model_interval_1 = glm(family="multinomial", score_iteration_interval=1, lambda_search=True,
                           nlambdas=10, generate_scoring_history=True)
    model_interval_1.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    pyunit_utils.assert_equal_scoring_history(model_each, model_interval_1, col_list_compare)

    # CV runs additionally report cross-validation deviance columns
    col_list_compare.append("deviance_xval")
    col_list_compare.append("deviance_se")
    print("Building model with score_each_iteration turned on, with lambda search and CV.")
    model_each_cv = glm(family="multinomial", score_each_iteration=True, lambda_search=True,
                        nlambdas=10, nfolds=2, fold_assignment='modulo',
                        generate_scoring_history=True)
    model_each_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_interval=1.  Should generate same model as score_each_iteration turned on, with "
          "lambda search and CV.")
    model_interval_1_cv = glm(family="multinomial", score_iteration_interval=1, lambda_search=True,
                              nlambdas=10, nfolds=2, fold_assignment='modulo',
                              generate_scoring_history=True)
    model_interval_1_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    pyunit_utils.assert_equal_scoring_history(model_each_cv, model_interval_1_cv, col_list_compare)

    # interval-4 history must line up with the interval-1 history at the scored iterations
    model_interval_4_cv = glm(family="multinomial", score_iteration_interval=4, lambda_search=True,
                              nlambdas=10, nfolds=2, fold_assignment='modulo',
                              generate_scoring_history=True)
    model_interval_4_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(model_interval_1_cv, model_interval_4_cv, col_list_compare)
    print("Done")
def test_glm_scoring_history_multinomial():
    """Verify GLM multinomial scoring history consistency (no lambda search).

    score_iteration_interval=1 must reproduce the score_each_iteration=True
    history (with and without CV); score_iteration_interval=4 must agree at
    the matching iterations of the interval-1 CV model.

    NOTE(review): this redefines test_glm_scoring_history_multinomial and
    shadows an earlier definition in this file — confirm whether one of the
    two should be renamed.
    """
    col_list_compare = [
        "iterations", "objective", "negative_log_likelihood",
        "training_logloss", "validation_logloss",
        "training_classification_error", "validation_classification_error",
        "deviance_train", "deviance_test"
    ]
    print("Preparing dataset....")
    h2o_data = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
        ))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    h2o_data["C3"] = h2o_data["C3"].asfactor()
    h2o_data["C4"] = h2o_data["C4"].asfactor()
    h2o_data["C5"] = h2o_data["C5"].asfactor()
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    train = splits_frames[0]
    valid = splits_frames[1]

    print("Building model with score_each_iteration turned on.")
    h2o_model_score_each = glm(family="multinomial",
                               score_each_iteration=True,
                               generate_scoring_history=True)
    h2o_model_score_each.train(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               y="C11",
                               training_frame=train,
                               validation_frame=valid)
    print(
        "Building model with score_interval=1.  Should generate same model as score_each_iteration turned on."
    )
    h2o_model = glm(family="multinomial",
                    score_iteration_interval=1,
                    generate_scoring_history=True)
    h2o_model.train(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                    y="C11",
                    training_frame=train,
                    validation_frame=valid)
    # scoring history from h2o_model_score_each and h2o_model should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each, h2o_model,
                                              col_list_compare)

    print(
        "Building model with score_each_iteration turned on and cross-validaton on."
    )
    h2o_model_score_each_cv = glm(family="multinomial",
                                  score_each_iteration=True,
                                  nfolds=2,
                                  seed=1234,
                                  fold_assignment="modulo",
                                  generate_scoring_history=True)
    h2o_model_score_each_cv.train(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  y="C11",
                                  training_frame=train,
                                  validation_frame=valid)
    print(
        "Building model with score_interval=1 and cross-validation on.  Should generate same model as "
        "score_each_iteration and cv turned on.")
    h2o_model_cv = glm(family="multinomial",
                       score_iteration_interval=1,
                       nfolds=2,
                       fold_assignment="modulo",
                       seed=1234,
                       generate_scoring_history=True)
    h2o_model_cv.train(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                       y="C11",
                       training_frame=train,
                       validation_frame=valid)
    # scoring history from h2o_model_score_each_cv and h2o_model_cv should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each_cv,
                                              h2o_model_cv, col_list_compare)

    # check if scoring_interval is set to 4, the output should be the same for every fourth iteration
    print(
        "Building model with score_interval=4 and cross-validation on.  Should generate same model as "
        "other models and same scoring history at the correct iteration.")
    h2o_model_cv_4th = glm(family="multinomial",
                           # BUG FIX: was 3, contradicting the variable name, the print
                           # message, the comment above, and every parallel test in this file
                           score_iteration_interval=4,
                           nfolds=2,
                           fold_assignment="modulo",
                           seed=1234,
                           generate_scoring_history=True)
    h2o_model_cv_4th.train(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                           y="C11",
                           training_frame=train,
                           validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(h2o_model_cv,
                                                    h2o_model_cv_4th,
                                                    col_list_compare)
def testGLMBinomialScoringHistoryLambdaSearch():
    """Verify GLM binomial scoring history under lambda search.

    With lambda search, score_iteration_interval=1 must match
    score_each_iteration=True (with and without CV).  Lambda search ignores
    the user's interval choice, so an interval of 4 must still yield the
    same (end-of-run) deviance history as an interval of 1.
    """
    col_list_compare = ["iteration", "training_logloss", "validation_logloss",
                        "training_classification_error", "validation_classification_error",
                        "training_rmse", "validation_rmse", "training_auc", "validation_auc",
                        "training_pr_auc", "validation_pr_auc", "training_lift",
                        "validation_lift", "deviance_train", "deviance_test"]
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    # first ten predictors and the response are categorical
    for col_idx in range(10):
        frame[col_idx] = frame[col_idx].asfactor()
    frame["C21"] = frame["C21"].asfactor()
    train, valid = frame.split_frame(ratios=[.8], seed=1234)
    response = "C21"
    predictors = list(range(0, 20))

    print("Building model with score_interval=1 and lambda search on.  Should generate same model as "
          "score_each_iteration turned on.  However, in this case, no scoring history is generated at "
          "every iteration due to speed constraint.")
    model_interval_1 = glm(family="binomial", score_iteration_interval=1,
                           lambda_search=True, nlambdas=10)
    model_interval_1.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_each_iteration turned on, with lambda search.")
    model_each = glm(family="binomial", score_each_iteration=True,
                     lambda_search=True, nlambdas=10)
    model_each.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    # the two scoring histories must be identical
    pyunit_utils.assert_equal_scoring_history(model_each, model_interval_1, col_list_compare)

    print("Building model with score_each_iteration turned on, with lambda search and CV.")
    model_each_cv = glm(family="binomial", score_each_iteration=True,
                        lambda_search=True, nlambdas=10, nfolds=2,
                        fold_assignment='modulo')
    model_each_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_interval=1, lambda search on and CV.  Should generate same model as "
          "score_each_iteration turned on, with lambda search and CV.")
    model_interval_1_cv = glm(family="binomial", score_iteration_interval=1,
                              lambda_search=True, nlambdas=10, nfolds=2,
                              fold_assignment='modulo')
    model_interval_1_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    # CV runs additionally report cross-validation deviance columns
    col_list_compare.append("deviance_xval")
    col_list_compare.append("deviance_se")
    pyunit_utils.assert_equal_scoring_history(model_each_cv, model_interval_1_cv, col_list_compare)

    # lambda search does not respect user choice for score_iteration_interval.  Scoring history should be the same
    # no matter what interval you specify.  Since scoring is only done at the end, no regular training metrics are
    # available.
    col_list_compare = ["iteration", "deviance_train", "deviance_test", "deviance_xval",
                        "deviance_se"]
    model_interval_4_cv = glm(family="binomial", score_iteration_interval=4,
                              lambda_search=True, nlambdas=10, nfolds=2,
                              fold_assignment='modulo')
    model_interval_4_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    pyunit_utils.assert_equal_scoring_history(model_interval_1_cv, model_interval_4_cv, col_list_compare)
def testGLMBinomialScoringHistoryLambdaSearch():
    """Verify GLM binomial scoring history under lambda search with
    generate_scoring_history=True.

    score_iteration_interval=1 must match score_each_iteration=True (with and
    without CV); score_iteration_interval=4 must agree at the matching
    iterations of the interval-1 CV model.

    NOTE(review): this redefines testGLMBinomialScoringHistoryLambdaSearch and
    shadows an earlier definition in this file — confirm whether one of the
    two should be renamed.
    """
    col_list_compare = ["iterations", "training_logloss", "validation_logloss",
                        "training_classification_error", "validation_classification_error",
                        "training_rmse", "validation_rmse", "training_auc", "validation_auc",
                        "training_pr_auc", "validation_pr_auc", "training_lift",
                        "validation_lift", "deviance_train", "deviance_test"]
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    # first ten predictors and the response are categorical
    for col_idx in range(10):
        frame[col_idx] = frame[col_idx].asfactor()
    frame["C21"] = frame["C21"].asfactor()
    train, valid = frame.split_frame(ratios=[.8], seed=1234)
    response = "C21"
    predictors = list(range(0, 20))

    print("Building model with score_interval=1 and lambda search on.  Should generate same model as "
          "score_each_iteration turned on.")
    model_interval_1 = glm(family="binomial", score_iteration_interval=1,
                           lambda_search=True, nlambdas=10,
                           generate_scoring_history=True)
    model_interval_1.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_each_iteration turned on, with lambda search.")
    model_each = glm(family="binomial", score_each_iteration=True,
                     lambda_search=True, nlambdas=10,
                     generate_scoring_history=True)
    model_each.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    # the two scoring histories must be identical
    pyunit_utils.assert_equal_scoring_history(model_each, model_interval_1, col_list_compare)

    print("Building model with score_each_iteration turned on, with lambda search and CV.")
    model_each_cv = glm(family="binomial", score_each_iteration=True,
                        lambda_search=True, nlambdas=10, nfolds=2,
                        fold_assignment='modulo', generate_scoring_history=True)
    model_each_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_interval=1, lambda search on and CV.  Should generate same model as "
          "score_each_iteration turned on, with lambda search and CV.")
    model_interval_1_cv = glm(family="binomial", score_iteration_interval=1,
                              lambda_search=True, nlambdas=10, nfolds=2,
                              fold_assignment='modulo', generate_scoring_history=True)
    model_interval_1_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    # CV runs additionally report cross-validation deviance columns
    col_list_compare.append("deviance_xval")
    col_list_compare.append("deviance_se")
    pyunit_utils.assert_equal_scoring_history(model_each_cv, model_interval_1_cv, col_list_compare)

    # interval-4 history must line up with the interval-1 history at the scored iterations
    model_interval_4_cv = glm(family="binomial", score_iteration_interval=4,
                              lambda_search=True, nlambdas=10, nfolds=2,
                              fold_assignment='modulo', generate_scoring_history=True)
    model_interval_4_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(model_interval_1_cv, model_interval_4_cv, col_list_compare)
def testGLMGaussianScoringHistory():
    """Verify GLM gaussian scoring history consistency under lambda search.

    score_each_iteration=True must match score_iteration_interval=1, with and
    without CV.  Lambda search ignores the user's interval choice, so an
    interval of 4 must still yield the same (end-of-run) deviance history.

    NOTE(review): this redefines testGLMGaussianScoringHistory and shadows an
    earlier definition in this file — confirm whether one of the two should
    be renamed.
    """
    col_list_compare = [
        "iterations", "training_rmse", "validation_rmse", "training_mae",
        "validation_mae", "training_deviance", "validation_deviance"
    ]
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"
    ]
    for cname in enum_columns:
        # BUG FIX: original was a no-op self-assignment; convert to categorical as intended
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    # BUG FIX: list.remove() returns None, so myX used to be None; build the predictor list explicitly
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build gaussian model with score_each_interval to true
    model = glm(family="gaussian",
                score_each_iteration=True,
                lambda_search=True,
                nlambdas=10)
    model.train(x=myX,
                y=myY,
                training_frame=training_data,
                validation_frame=test_data)
    # build gaussian model with score_iteration_interval to 1
    model_score_each = glm(family="gaussian",
                           score_iteration_interval=1,
                           lambda_search=True,
                           nlambdas=10)
    model_score_each.train(x=myX,
                           y=myY,
                           training_frame=training_data,
                           validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each,
                                              col_list_compare)

    # build gaussian model with score_each_interval to true, with CV
    model_cv = glm(family="gaussian",
                   score_each_iteration=True,
                   nfolds=3,
                   fold_assignment='modulo',
                   seed=1234,
                   lambda_search=True,
                   nlambdas=10)
    model_cv.train(x=myX,
                   y=myY,
                   training_frame=training_data,
                   validation_frame=test_data)
    # build gaussian model with score_iteration_interval to 1, with CV
    model_score_each_cv = glm(family="gaussian",
                              score_iteration_interval=1,
                              nfolds=3,
                              fold_assignment='modulo',
                              seed=1234,
                              lambda_search=True,
                              nlambdas=10)
    model_score_each_cv.train(x=myX,
                              y=myY,
                              training_frame=training_data,
                              validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv,
                                              col_list_compare)

    # lambda search does not respect user choice for score_iteration_interval.  Scoring history should be the same
    # no matter what interval you specify.  Since scoring is only done at the end, no regular training metrics are
    # available.
    col_list_compare = [
        "iteration", "deviance_train", "deviance_test", "deviance_xval",
        "deviance_se"
    ]
    model_cv_4th = glm(family="gaussian",
                       score_iteration_interval=4,
                       nfolds=3,
                       fold_assignment='modulo',
                       seed=1234,
                       lambda_search=True,
                       nlambdas=10)
    model_cv_4th.train(x=myX,
                       y=myY,
                       training_frame=training_data,
                       validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_score_each_cv,
                                              model_cv_4th, col_list_compare)