def test_infogram_personal_loan_plot():
    """
    Verify that an infogram built with protected columns can be plotted.

    Two models are trained on the personal loan data: one with only a seed and
    the protected columns, and one that additionally sets the safety and
    relevance index thresholds to 0.05.  Both are plotted with server=True, and
    the first model must not report more admissible CMI values than the second.
    """
    csv_path = pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv")
    loans = h2o.import_file(path=csv_path)
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience",
        "Income",
        "Family",
        "CCAvg",
        "Education",
        "Mortgage",
        "Securities Account",
        "CD Account",
        "Online",
        "CreditCard",
    ]

    baseline = H2OInfogram(seed=12345, protected_columns=["Age", "ZIP Code"])
    baseline.train(x=predictors, y=response, training_frame=loans)
    baseline.plot(server=True)

    with_thresholds = H2OInfogram(seed=12345,
                                  protected_columns=["Age", "ZIP Code"],
                                  safety_index_threshold=0.05,
                                  relevance_index_threshold=0.05)
    with_thresholds.train(x=predictors, y=response, training_frame=loans)
    with_thresholds.plot(server=True)

    baseline_count = len(baseline.get_admissible_cmi())
    threshold_count = len(with_thresholds.get_admissible_cmi())
    assert baseline_count <= threshold_count
# Example #2
# 0
def test_infogram_breast_cancer():
    """
    Simple breast cancer data test to check that core infogram is working:
     1. relevance and cmi returned by the model agree with the reference values
        from Deep's code (compared as sorted lists, within a small tolerance).
     2. when the algorithm and its parameters are specified, the infogram uses
        them: a GBM limited to 3 trees of depth 5 must yield CMI values that
        differ from those of the model built without algorithm settings.
    """
    deep_rel = [
        0.0040477989, 0.0974455315, 0.0086303713, 0.0041002103, 0.0037914745,
        0.0036801151, 0.0257819346, 0.2808010416, 0.0005372569, 0.0036280018,
        0.0032444598, 0.0002943119, 0.0026430897, 0.0262074332, 0.0033317064,
        0.0068812603, 0.0006185385, 0.0082121491, 0.0014562177, 0.0081786997,
        1.0000000000, 0.0894895310, 0.6187801784, 0.3302352775, 0.0021346433,
        0.0016077771, 0.0260198502, 0.3404628948, 0.0041384517, 0.0019399743
    ]
    deep_cmi = [
        0.00000000, 0.31823883, 0.52769230, 0.00000000, 0.00000000, 0.00000000,
        0.01183309, 0.67430653, 0.00000000, 0.00000000, 0.45443221, 0.00000000,
        0.24561013, 0.87720587, 0.31939378, 0.19370515, 0.00000000, 0.16463918,
        0.00000000, 0.00000000, 0.44830772, 1.00000000, 0.00000000, 0.00000000,
        0.62478098, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.64466111
    ]
    fr = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/wdbc_changed.csv"))
    target = "diagnosis"
    fr[target] = fr[target].asfactor()
    x = [
        "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
        "smoothness_mean", "compactness_mean", "concavity_mean",
        "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
        "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
        "compactness_se", "concavity_se", "concave_points_se", "symmetry_se",
        "fractal_dimension_se", "radius_worst", "texture_worst",
        "perimeter_worst", "area_worst", "smoothness_worst",
        "compactness_worst", "concavity_worst", "concave_points_worst",
        "symmetry_worst", "fractal_dimension_worst"
    ]
    infogram_model = H2OInfogram(seed=12345, top_n_features=50)
    infogram_model.train(x=x, y=target, training_frame=fr)

    # make sure our result matches Deep's
    _, rel = infogram_model.get_all_predictor_relevance()
    cmi_names, cmi = infogram_model.get_all_predictor_cmi()
    # list.sort() sorts in place and returns None, so comparing the results of
    # two .sort() calls is always true.  Compare sorted copies element by
    # element instead, leaving the original lists untouched.
    # NOTE(review): tolerance picked to suit the printed precision of the
    # reference values -- confirm it is appropriate for this dataset.
    tolerance = 1e-6
    for expected, actual in ((deep_rel, rel), (deep_cmi, cmi)):
        expected_sorted = sorted(expected)
        actual_sorted = sorted(actual)
        failure_message = "Expected: {0}, actual: {1}".format(
            expected_sorted, actual_sorted)
        assert len(expected_sorted) == len(actual_sorted), failure_message
        assert all(
            abs(want - got) <= tolerance
            for want, got in zip(expected_sorted, actual_sorted)
        ), failure_message

    gbm_params = {'ntrees': 3, 'max_depth': 5}
    infogram_model_gbm = H2OInfogram(seed=12345,
                                     top_n_features=50,
                                     algorithm='gbm',
                                     algorithm_params=gbm_params)
    infogram_model_gbm.train(x=x, y=target, training_frame=fr)
    gbm_names, cmi_gbm = infogram_model_gbm.get_all_predictor_cmi()
    # The two models may list predictors in different orders, so pair the CMI
    # values by predictor name rather than by list position.
    cmi_by_name = dict(zip(cmi_names, cmi))
    cmi_gbm_by_name = dict(zip(gbm_names, cmi_gbm))
    assert any(
        abs(cmi_gbm_by_name[name] - value) > 0.01
        for name, value in cmi_by_name.items()
        if name in cmi_gbm_by_name
    ), "CMI from infogram model with gbm using different number of trees should" \
       " be different but are not."
# Example #3
# 0
def test_infogram_breast_cancer_cv_fold_column():
    """
    Check that cross-validation through an explicit fold column matches
    cross-validation requested with nfolds and modulo fold assignment.

    One model is trained with nfolds=3 and fold_assignment="modulo".  A second
    model is trained on the same data with a fold column produced by
    modulo_kfold_column.  The admissible score frames of both models, for the
    training data and for the cross-validation holdout, must compare equal.
    """
    # NOTE(review): this function name is defined again further down in this
    # file; within a single module the last definition replaces earlier ones.
    cancer = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/wdbc_changed.csv"))
    response = "diagnosis"
    cancer[response] = cancer[response].asfactor()

    # Column names are <measurement>_<statistic>, grouped by statistic.
    measurements = [
        "radius", "texture", "perimeter", "area", "smoothness", "compactness",
        "concavity", "concave_points", "symmetry", "fractal_dimension"
    ]
    predictors = [
        "{0}_{1}".format(measurement, statistic)
        for statistic in ("mean", "se", "worst")
        for measurement in measurements
    ]

    fold_count = 3
    nfolds_model = H2OInfogram(seed=12345,
                               top_n_features=50,
                               nfolds=fold_count,
                               fold_assignment="modulo")
    nfolds_model.train(x=predictors, y=response, training_frame=cancer)
    train_scores_nfolds = nfolds_model.get_admissible_score_frame()
    xval_scores_nfolds = nfolds_model.get_admissible_score_frame(xval=True)

    # Append the fold assignment as a regular column of the frame.
    fold_ids = cancer.modulo_kfold_column(n_folds=fold_count)
    fold_ids.set_names(["fold_numbers"])
    cancer = cancer.cbind(fold_ids)

    fold_column_model = H2OInfogram(seed=12345,
                                    top_n_features=50,
                                    fold_column="fold_numbers")
    fold_column_model.train(x=predictors, y=response, training_frame=cancer)
    train_scores_fold_column = fold_column_model.get_admissible_score_frame()
    xval_scores_fold_column = fold_column_model.get_admissible_score_frame(
        xval=True)

    checks = (
        ("Comparing infogram data from training dataset",
         train_scores_nfolds, train_scores_fold_column),
        ("Comparing infogram data from cross-validation dataset",
         xval_scores_nfolds, xval_scores_fold_column),
    )
    for banner, from_nfolds, from_fold_column in checks:
        print(banner)
        pyunit_utils.compare_frames_local(from_nfolds,
                                          from_fold_column,
                                          prob=1)
# Example #4
# 0
def test_infogram_german_data():
    """
    Simple german data test to check that safe infogram is working:
     1. relevance and cmi returned by the model agree with the reference values
        from Deep's original code (compared as sorted lists, within a small
        tolerance).
     2. when the algorithm and its parameters are specified, the infogram uses
        them: a GBM limited to 3 trees must yield CMI values that differ from
        those of the model built without algorithm settings.
    """
    deep_rel = [
        1.00000000, 0.58302027, 0.43431236, 0.66177924, 0.53677082, 0.25084764,
        0.34379833, 0.13251726, 0.11473028, 0.09548423, 0.20398740, 0.16432640,
        0.06875276, 0.04870468, 0.12573930, 0.01382682, 0.04496173, 0.01273963
    ]
    deep_cmi = [
        0.84946975, 0.73020930, 0.58553936, 0.75780528, 1.00000000, 0.38461582,
        0.57575695, 0.30663930, 0.07604779, 0.19979514, 0.42293369, 0.20628365,
        0.25316918, 0.15096705, 0.24501686, 0.11296778, 0.13068605, 0.03841617
    ]
    fr = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/german_credit.csv"))
    target = "BAD"
    fr[target] = fr[target].asfactor()
    # Predictors are every column except the response and the protected ones.
    x = fr.names
    x.remove(target)
    x.remove("status_gender")
    x.remove("age")
    infogram_model = H2OInfogram(seed=12345,
                                 protected_columns=["status_gender", "age"],
                                 top_n_features=50)
    infogram_model.train(x=x, y=target, training_frame=fr)

    # make sure our result matches Deep's
    _, rel = infogram_model.get_all_predictor_relevance()
    cmi_names, cmi = infogram_model.get_all_predictor_cmi()
    # list.sort() sorts in place and returns None, so comparing the results of
    # two .sort() calls is always true.  Compare sorted copies element by
    # element instead, leaving the original lists untouched.
    # NOTE(review): tolerance picked to suit the printed precision of the
    # reference values -- confirm it is appropriate for this dataset.
    tolerance = 1e-6
    for expected, actual in ((deep_rel, rel), (deep_cmi, cmi)):
        expected_sorted = sorted(expected)
        actual_sorted = sorted(actual)
        failure_message = "Expected: {0}, actual: {1}".format(
            expected_sorted, actual_sorted)
        assert len(expected_sorted) == len(actual_sorted), failure_message
        assert all(
            abs(want - got) <= tolerance
            for want, got in zip(expected_sorted, actual_sorted)
        ), failure_message

    gbm_params = {'ntrees': 3}
    infogram_model_gbm_glm = H2OInfogram(
        seed=12345,
        protected_columns=["status_gender", "age"],
        top_n_features=50,
        algorithm='gbm',
        algorithm_params=gbm_params)
    infogram_model_gbm_glm.train(x=x, y=target, training_frame=fr)
    gbm_names, cmi_gbm_glm = infogram_model_gbm_glm.get_all_predictor_cmi()
    # The two models may list predictors in different orders, so pair the CMI
    # values by predictor name rather than by list position.
    cmi_by_name = dict(zip(cmi_names, cmi))
    cmi_gbm_by_name = dict(zip(gbm_names, cmi_gbm_glm))
    assert any(
        abs(cmi_gbm_by_name[name] - value) > 0.01
        for name, value in cmi_by_name.items()
        if name in cmi_gbm_by_name
    ), "CMI from infogram model with gbm using different number of trees should" \
       " be different but are not."
def test_infogram_iris_wrong_thresholds():
    """
    Check that warnings are raised when the safety/relevance index thresholds
    are given to an infogram built without protected columns.

    Exactly two warnings are expected while the model is built and trained, and
    one of them must mention 'index_threshold for core infogram runs.'.
    """
    iris = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv"))
    response = "Species"
    iris[response] = iris[response].asfactor()
    predictors = [name for name in iris.names if name != response]

    with pyunit_utils.catch_warnings() as caught:
        # thresholds deliberately set to the index variants for this run
        model = H2OInfogram(seed=12345,
                            distribution='multinomial',
                            safety_index_threshold=0.2,
                            relevance_index_threshold=0.2,
                            top_n_features=len(predictors))
        model.train(x=predictors, y=response, training_frame=iris)

        warning_count = len(caught)
        assert warning_count == 2, \
            "Expected two warnings but received {0} warnings instead.".format(
                warning_count)
        assert pyunit_utils.contains_warning(
            caught, 'index_threshold for core infogram runs.')
def test_infogram_personal_loan():
    """
    Check that warnings are raised when the net/total information thresholds
    are given to an infogram built with protected columns.

    Exactly two warnings are expected while the model is built and trained, and
    one of them must mention 'information_threshold for fair infogram runs.'.
    """
    # NOTE(review): this function name is defined again right after this test
    # in this file; within a single module the later definition replaces it.
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience",
        "Income",
        "Family",
        "CCAvg",
        "Education",
        "Mortgage",
        "Securities Account",
        "CD Account",
        "Online",
        "CreditCard",
    ]

    with pyunit_utils.catch_warnings() as caught:
        model = H2OInfogram(seed=12345,
                            protected_columns=["Age", "ZIP Code"],
                            top_n_features=len(predictors),
                            net_information_threshold=0.2,
                            total_information_threshold=0.2)
        model.train(x=predictors, y=response, training_frame=loans)

        warning_count = len(caught)
        assert warning_count == 2, \
            "Expected two warnings but received {0} warnings instead.".format(
                warning_count)
        assert pyunit_utils.contains_warning(
            caught, 'information_threshold for fair infogram runs.')
def test_infogram_personal_loan():
    """
    Check that a trained infogram model can be passed directly as the
    predictor specification of another estimator.

    A GLM trained with x set to the infogram model must end up with the same
    coefficients (within 1e-6) as a GLM trained with x set to the list returned
    by the infogram's _extract_x_from_model().
    """
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience",
        "Income",
        "Family",
        "CCAvg",
        "Education",
        "Mortgage",
        "Securities Account",
        "CD Account",
        "Online",
        "CreditCard",
    ]
    infogram = H2OInfogram(seed=12345, protected_columns=["Age", "ZIP Code"])
    infogram.train(x=predictors, y=response, training_frame=loans)

    # GLM trained on the explicitly extracted predictor names
    glm_from_names = H2OGeneralizedLinearEstimator()
    extracted_predictors = infogram._extract_x_from_model()
    glm_from_names.train(x=extracted_predictors,
                         y=response,
                         training_frame=loans)
    coefs_from_names = glm_from_names.coef()

    # GLM trained with the infogram model itself passed as x
    glm_from_model = H2OGeneralizedLinearEstimator()
    glm_from_model.train(x=infogram, y=response, training_frame=loans)
    coefs_from_model = glm_from_model.coef()

    pyunit_utils.assertCoefDictEqual(coefs_from_names,
                                     coefs_from_model,
                                     tol=1e-6)
def test_infogram_iris_x_attributes():
    """
    Show that a trained infogram model can be used to specify predictors.

    Two multinomial GLMs are trained on the iris data: one with x set to the
    list returned by the infogram's _extract_x_from_model(), the other with x
    set to the infogram model itself.  Their coefficients must agree, class by
    class, within 1e-6.
    """
    iris = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv"))
    response = "Species"
    iris[response] = iris[response].asfactor()
    predictors = [name for name in iris.names if name != response]

    infogram = H2OInfogram(seed=12345, distribution='multinomial')
    infogram.train(x=predictors, y=response, training_frame=iris)

    # GLM trained on the explicitly extracted predictor names
    glm_from_names = H2OGeneralizedLinearEstimator(family='multinomial')
    extracted_predictors = infogram._extract_x_from_model()
    glm_from_names.train(x=extracted_predictors,
                         y=response,
                         training_frame=iris)
    coefs_from_names = glm_from_names.coef()

    # GLM trained with the infogram model itself passed as x
    glm_from_model = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_model.train(x=infogram, y=response, training_frame=iris)
    coefs_from_model = glm_from_model.coef()

    # coef() is keyed by class here; compare each class separately
    for class_key in coefs_from_names.keys():
        pyunit_utils.assertCoefDictEqual(coefs_from_names[class_key],
                                         coefs_from_model[class_key],
                                         tol=1e-6)
def test_infogram_personal_loan_cv_fold_column():
    """
    Check that an infogram with protected columns gives the same results when
    cross-validation is driven by a fold column as when it is driven by nfolds
    with modulo fold assignment.

    The admissible score frames of both models, for the training data and for
    the cross-validation holdout, must compare equal.
    """
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience",
        "Income",
        "Family",
        "CCAvg",
        "Education",
        "Mortgage",
        "Securities Account",
        "CD Account",
        "Online",
        "CreditCard",
    ]

    fold_count = 3
    # cross-validation requested through nfolds
    nfolds_model = H2OInfogram(seed=12345,
                               protected_columns=["Age", "ZIP Code"],
                               nfolds=fold_count,
                               fold_assignment='modulo')
    nfolds_model.train(x=predictors, y=response, training_frame=loans)

    # Append the fold assignment as a regular column of the frame.
    fold_ids = loans.modulo_kfold_column(n_folds=fold_count)
    fold_ids.set_names(["fold_numbers"])
    loans = loans.cbind(fold_ids)

    # cross-validation requested through the fold column
    fold_column_model = H2OInfogram(seed=12345,
                                    protected_columns=["Age", "ZIP Code"],
                                    fold_column="fold_numbers")
    fold_column_model.train(x=predictors, y=response, training_frame=loans)

    stages = (
        ("compare rel cmi from training dataset", {}),
        ("compare rel cmi from cross-validation hold out", {"xval": True}),
    )
    for banner, frame_options in stages:
        print(banner)
        from_nfolds = nfolds_model.get_admissible_score_frame(**frame_options)
        from_fold_column = fold_column_model.get_admissible_score_frame(
            **frame_options)
        pyunit_utils.compare_frames_local(from_nfolds,
                                          from_fold_column,
                                          prob=1.0)
# Example #10
# 0
def test_infogram_personal_loan_cv_valid():
    """
    Check that an infogram with protected columns gives consistent results
    with and without a validation frame and with and without cross-validation.

    Four models are trained on the same training split: plain, with a
    validation frame, with nfolds=3, and with both.  Their admissible score
    frames are compared for the training data, the validation data and the
    cross-validation holdout.
    """
    # NOTE(review): this function name is defined again further down in this
    # file; within a single module the last definition replaces earlier ones.
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
        "Securities Account", "CD Account", "Online", "CreditCard"
    ]
    train, test = loans.split_frame(ratios=[0.80])[:2]

    def fit(model_options, train_options):
        # Build one infogram with the shared settings, train it, return it.
        model = H2OInfogram(seed=12345,
                            protected_columns=["Age", "ZIP Code"],
                            **model_options)
        model.train(x=predictors,
                    y=response,
                    training_frame=train,
                    **train_options)
        return model

    plain = fit({}, {})  # training dataset only
    with_valid = fit({}, {"validation_frame": test})
    with_cv = fit({"nfolds": 3}, {})
    with_cv_valid = fit({"nfolds": 3}, {"validation_frame": test})

    print("compare rel cmi from training dataset")
    train_plain = plain.get_admissible_score_frame()
    train_valid = with_valid.get_admissible_score_frame()
    train_cv = with_cv.get_admissible_score_frame()
    train_cv_valid = with_cv_valid.get_admissible_score_frame()
    for left, right in ((train_plain, train_valid),
                        (train_cv, train_cv_valid),
                        (train_cv, train_plain)):
        pyunit_utils.compare_frames_local(left, right, prob=1.0)

    print("compare rel cmi from validation dataset")
    valid_valid = with_valid.get_admissible_score_frame(valid=True)
    valid_cv_valid = with_cv_valid.get_admissible_score_frame(valid=True)
    pyunit_utils.compare_frames_local(valid_valid, valid_cv_valid, prob=1.0)

    print("compare rel cmi from cross-validation hold out")
    xval_cv = with_cv.get_admissible_score_frame(xval=True)
    xval_cv_valid = with_cv_valid.get_admissible_score_frame(xval=True)
    pyunit_utils.compare_frames_local(xval_cv, xval_cv_valid, prob=1.0)
def test_infogram_iris_plot():
    """
    Check that an infogram built without protected columns can be plotted.

    Two multinomial models are trained on the iris data: one with only a seed
    and the distribution, and one that additionally sets the net and total
    information thresholds to 0.05.  Both are plotted with server=True, and
    the first model must not report more admissible CMI values than the second.
    """
    iris = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv"))
    response = "Species"
    iris[response] = iris[response].asfactor()
    predictors = [name for name in iris.names if name != response]

    baseline = H2OInfogram(seed=12345, distribution='multinomial')
    baseline.train(x=predictors, y=response, training_frame=iris)
    baseline.plot(server=True)  # server=True: make sure graph will not show

    with_thresholds = H2OInfogram(seed=12345,
                                  distribution='multinomial',
                                  net_information_threshold=0.05,
                                  total_information_threshold=0.05)
    with_thresholds.train(x=predictors, y=response, training_frame=iris)
    with_thresholds.plot(server=True)

    baseline_count = len(baseline.get_admissible_cmi())
    threshold_count = len(with_thresholds.get_admissible_cmi())
    assert baseline_count <= threshold_count
# Example #12
# 0
def test_infogram_breast_cancer_cv_fold_column():
    """
    Check that infogram plotting works for a model trained with both
    cross-validation and a validation frame.

    The model is plotted three times (default, train+valid, and
    train+valid+xval), and the admissible score frames for the validation data
    and the cross-validation holdout must have the same number of rows.
    """
    # NOTE(review): this function name is defined again further down in this
    # file; within a single module the last definition replaces earlier ones.
    cancer = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/wdbc_changed.csv"))
    response = "diagnosis"
    cancer[response] = cancer[response].asfactor()

    # Column names are <measurement>_<statistic>, grouped by statistic.
    measurements = [
        "radius", "texture", "perimeter", "area", "smoothness", "compactness",
        "concavity", "concave_points", "symmetry", "fractal_dimension"
    ]
    predictors = [
        "{0}_{1}".format(measurement, statistic)
        for statistic in ("mean", "se", "worst")
        for measurement in measurements
    ]

    train, test = cancer.split_frame(ratios=[0.80])[:2]
    fold_count = 5
    model = H2OInfogram(seed=12345,
                        top_n_features=50,
                        nfolds=fold_count,
                        fold_assignment="modulo")
    model.train(x=predictors,
                y=response,
                training_frame=train,
                validation_frame=test)

    plot_requests = (
        {"title": "infogram from training dataset 1"},
        {"train": True,
         "valid": True,
         "title": "infogram from traiing/validation dataset 1"},
        {"train": True,
         "valid": True,
         "xval": True,
         "title": "infogram from training/validation/cv holdout dataset 1"},
    )
    for request in plot_requests:
        model.plot(server=True, **request)

    valid_scores = model.get_admissible_score_frame(valid=True)
    xval_scores = model.get_admissible_score_frame(xval=True)
    assert valid_scores.nrow == xval_scores.nrow
def test_infogram_iris():
    """
    Simple Iris test to check that core infogram is working:
     1. relevance and cmi returned by the model agree with the reference values
        from Deep's original code (compared as sorted lists, within a small
        tolerance).
     2. check and make sure the admissible score frame contains the same
        relevance and cmi values as the lists returned by the model.
     3. check the admissible features contains cmi and relevance >= 0.1
    """
    deep_rel = [0.009010006, 0.011170417, 0.755170945, 1.000000000]
    deep_cmi = [0.1038524, 0.7135458, 0.5745915, 1.0000000]
    fr = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv"))
    target = "Species"
    fr[target] = fr[target].asfactor()
    x = fr.names
    x.remove(target)

    infogram_model = H2OInfogram(
        seed=12345, distribution='multinomial'
    )  # build infogram model with default settings
    infogram_model.train(x=x, y=target, training_frame=fr)

    # make sure frame returning all predictors, relevance and cmi contains correct value
    _, rel = infogram_model.get_all_predictor_relevance()
    _, cmi = infogram_model.get_all_predictor_cmi()
    predictor_rel_cmi_frame = infogram_model.get_admissible_score_frame(
    )  # get relevance and cmi frame
    assert_list_frame_equal(cmi, rel, predictor_rel_cmi_frame)

    # make sure our result matches Deep's
    # list.sort() sorts in place and returns None, so comparing the results of
    # two .sort() calls is always true.  Compare sorted copies element by
    # element instead, leaving the original lists untouched.
    # NOTE(review): tolerance picked to suit the printed precision of the
    # reference values -- confirm it is appropriate for this dataset.
    tolerance = 1e-6
    for expected, actual in ((deep_rel, rel), (deep_cmi, cmi)):
        expected_sorted = sorted(expected)
        actual_sorted = sorted(actual)
        failure_message = "Expected: {0}, actual: {1}".format(
            expected_sorted, actual_sorted)
        assert len(expected_sorted) == len(actual_sorted), failure_message
        assert all(
            abs(want - got) <= tolerance
            for want, got in zip(expected_sorted, actual_sorted)
        ), failure_message

    # check admissible features values >= 0.1
    admissible_rel = infogram_model.get_admissible_relevance()
    admissible_cmi = infogram_model.get_admissible_cmi()
    # zip() stops at the shorter list, so check the lengths agree first
    assert len(admissible_rel) == len(admissible_cmi), \
        "Admissible relevance and cmi lists differ in length: {0} vs {1}".format(
            len(admissible_rel), len(admissible_cmi))
    for one_rel, one_cmi in zip(admissible_rel, admissible_cmi):
        assert one_rel >= 0.1, "Admissible relevance should equal or exceed 0.1 but is not.  Actual admissible" \
                               " relevance is {0}".format(one_rel)
        assert one_cmi >= 0.1, "Admissible cmi should equal or exceed 0.1 but is not.  Actual admissible cmi" \
                               " is {0}".format(one_cmi)
def test_infogram_personal_loan_cv_valid():
    """
    Check that infogram plotting works for a model with protected columns that
    was trained with both cross-validation and a validation frame.

    The model is plotted three times (default, train+valid, and
    train+valid+xval), and the admissible score frames for the training data
    and the validation data must have the same number of rows.
    """
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
        "Securities Account", "CD Account", "Online", "CreditCard"
    ]
    train, test = loans.split_frame(ratios=[0.80])[:2]

    model = H2OInfogram(seed=12345,
                        protected_columns=["Age", "ZIP Code"],
                        nfolds=5)
    model.train(x=predictors,
                y=response,
                training_frame=train,
                validation_frame=test)

    plot_requests = (
        # default plot call
        {"title": "Infogram calcuated from training dataset"},
        # training and validation requested
        {"train": True,
         "valid": True,
         "title": "Infogram calculated from training/validation dataset"},
        # training, validation and cross-validation holdout requested
        {"train": True,
         "valid": True,
         "xval": True,
         "title": "Infogram calculated from training/validation/xval holdout dataset"},
    )
    for request in plot_requests:
        model.plot(server=True, **request)

    train_scores = model.get_admissible_score_frame()
    valid_scores = model.get_admissible_score_frame(valid=True)
    assert train_scores.nrow == valid_scores.nrow
# Example #15
# 0
def test_infogram_breast_cancer_cv_fold_column():
    """
    Check that infogram results are consistent with and without a validation
    frame and with and without cross-validation.

    Four models are trained on the same training split of the breast cancer
    data: with cross-validation and a validation frame, plain, with a
    validation frame only, and with cross-validation only.  Their admissible
    score frames are compared for the training data, the validation data and
    the cross-validation holdout.
    """
    cancer = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/wdbc_changed.csv"))
    response = "diagnosis"
    cancer[response] = cancer[response].asfactor()

    # Column names are <measurement>_<statistic>, grouped by statistic.
    measurements = [
        "radius", "texture", "perimeter", "area", "smoothness", "compactness",
        "concavity", "concave_points", "symmetry", "fractal_dimension"
    ]
    predictors = [
        "{0}_{1}".format(measurement, statistic)
        for statistic in ("mean", "se", "worst")
        for measurement in measurements
    ]

    train, test = cancer.split_frame(ratios=[0.80])[:2]
    fold_count = 3

    # cross-validation plus validation frame
    cv_valid_model = H2OInfogram(seed=12345,
                                 top_n_features=50,
                                 nfolds=fold_count,
                                 fold_assignment="modulo")
    cv_valid_model.train(x=predictors,
                         y=response,
                         training_frame=train,
                         validation_frame=test)
    train_scores_cv_valid = cv_valid_model.get_admissible_score_frame()
    xval_scores_cv_valid = cv_valid_model.get_admissible_score_frame(
        xval=True)
    valid_scores_cv_valid = cv_valid_model.get_admissible_score_frame(
        valid=True)

    # training frame only
    plain_model = H2OInfogram(seed=12345, top_n_features=50)
    plain_model.train(x=predictors, y=response, training_frame=train)
    train_scores_plain = plain_model.get_admissible_score_frame()

    # validation frame only
    valid_model = H2OInfogram(seed=12345, top_n_features=50)
    valid_model.train(x=predictors,
                      y=response,
                      training_frame=train,
                      validation_frame=test)
    train_scores_valid = valid_model.get_admissible_score_frame()
    valid_scores_valid = valid_model.get_admissible_score_frame(valid=True)

    # cross-validation only
    cv_model = H2OInfogram(seed=12345,
                           top_n_features=50,
                           nfolds=fold_count,
                           fold_assignment="modulo")
    cv_model.train(x=predictors, y=response, training_frame=train)
    train_scores_cv = cv_model.get_admissible_score_frame()
    xval_scores_cv = cv_model.get_admissible_score_frame(xval=True)

    # Each stage prints its banner, then compares every listed pair of frames.
    stages = (
        ("Comparing infogram data from training dataset",
         ((train_scores_cv_valid, train_scores_plain),
          (train_scores_cv, train_scores_valid),
          (train_scores_cv_valid, train_scores_cv))),
        ("Comparing infogram data from validation dataset",
         ((valid_scores_cv_valid, valid_scores_valid),)),
        ("Comparing infogram data from cross-validation dataset",
         ((xval_scores_cv, xval_scores_cv_valid),)),
    )
    for banner, frame_pairs in stages:
        print(banner)
        for left, right in frame_pairs:
            pyunit_utils.compare_frames_local(left, right, prob=1)