예제 #1
0
def compare_weightedStats(model, datafile, xlist, xname, weightV, pdpTDTable, tol=1e-6):
    """Compare a partial-dependence table with manually computed weighted statistics.

    The reference values come from ``manual_partial_dependence``; element 0,
    1 and 2 of its result are compared with the ``mean_response``,
    ``stddev_response`` and ``std_error_mean_response`` columns of
    ``pdpTDTable`` respectively.

    :param model: forwarded unchanged to ``manual_partial_dependence``.
    :param datafile: forwarded unchanged to ``manual_partial_dependence``.
    :param xlist: forwarded unchanged to ``manual_partial_dependence``.
    :param xname: forwarded unchanged to ``manual_partial_dependence``.
    :param weightV: forwarded unchanged to ``manual_partial_dependence``.
    :param pdpTDTable: table from which the three statistic columns are read.
    :param tol: passed as both the third and the fourth positional argument
        of ``pyunit_utils.equal_two_arrays``.
    """
    # Theoretical weighted statistics computed outside of the model.
    expected_stats = manual_partial_dependence(model, datafile, xlist, xname, weightV)

    # Read all three columns first, then compare, so the helper calls happen
    # in the same order as extraction followed by comparison.
    stat_columns = ("mean_response", "stddev_response", "std_error_mean_response")
    reported_stats = [
        pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, column)
        for column in stat_columns
    ]

    # NOTE(review): throw_error=True presumably makes a mismatch raise -- confirm in pyunit_utils.
    for position, reported in enumerate(reported_stats):
        pyunit_utils.equal_two_arrays(expected_stats[position], reported, tol, tol, throw_error=True)
def compare_weightedStats(model, datafile, xlist, xname, weightV, pdpTDTable, tol=1e-6):
    """Compare a partial-dependence table with manually computed weighted statistics.

    The reference values come from ``manual_partial_dependence``; element 0,
    1 and 2 of its result are compared with the ``mean_response``,
    ``stddev_response`` and ``std_error_mean_response`` columns of
    ``pdpTDTable`` respectively.

    :param model: forwarded unchanged to ``manual_partial_dependence``.
    :param datafile: forwarded unchanged to ``manual_partial_dependence``.
    :param xlist: forwarded unchanged to ``manual_partial_dependence``.
    :param xname: forwarded unchanged to ``manual_partial_dependence``.
    :param weightV: forwarded unchanged to ``manual_partial_dependence``.
    :param pdpTDTable: table from which the three statistic columns are read.
    :param tol: passed as both the third and the fourth positional argument
        of ``pyunit_utils.equal_two_arrays``.
    """
    # Theoretical weighted statistics computed outside of the model.
    reference = manual_partial_dependence(model, datafile, xlist, xname, weightV)

    # Extract every column before comparing any of them (same call order as
    # reading the three columns up front and then checking them one by one).
    from_table = [
        pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, header)
        for header in ("mean_response", "stddev_response", "std_error_mean_response")
    ]

    # NOTE(review): throwError=True presumably makes a mismatch raise -- confirm in pyunit_utils.
    for slot, table_values in enumerate(from_table):
        pyunit_utils.equal_two_arrays(reference[slot], table_values, tol, tol, throwError=True)
def partial_plot_test():
    """Check GBM partial-dependence tables with weight columns and NA rows on the prostate data.

    Steps performed:
      1. Import the prostate data set and append a constant weight column
         (all 3.0) and a varying weight column (seeded random integers 0..5).
      2. Train a GBM on CAPSULE.
      3. Assert that the table from an unweighted partial plot equals, up to
         the shorter table's rows, the one obtained with the constant weight
         column and ``include_na=True``.
      4. Compare the partial plot obtained with the varying weights against
         ``compare_weightedStats`` for both RACE and AGE.
    """
    csv_path = 'smalldata/prostate/prostate_cat_NA.csv'
    y = 'CAPSULE'

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(csv_path))
    x = data.names
    x.remove(y)

    # Constant weights first, then the seeded varying weights.
    row_count = data.nrow
    weights = h2o.H2OFrame([3.0] * row_count)
    random.seed(12345)
    tweight2 = [random.randint(0, 5) for _ in range(row_count)]
    weights2 = h2o.H2OFrame(tweight2)
    data = data.cbind(weights).cbind(weights2)
    data.set_name(data.ncol - 2, "constWeight")
    data.set_name(data.ncol - 1, "variWeight")

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # Arguments common to every partial_plot call below.
    shared = dict(data=data, server=True, plot=True)

    # pdp without weight or NA
    pdpOrig = gbm_model.partial_plot(cols=['AGE', 'RACE'], **shared)
    # pdp with constant weight and NA
    pdpcWNA = gbm_model.partial_plot(cols=['AGE', 'RACE'], weight_column="constWeight",
                                     include_na=True, **shared)

    # compare results, one table per plotted column
    for table_idx in (0, 1):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[table_idx], pdpcWNA[table_idx],
                                                      pdpOrig[table_idx].col_header, tolerance=1e-10)

    # pdp with changing weight NA
    pdpvWNA = gbm_model.partial_plot(cols=['AGE', 'RACE'], weight_column="variWeight",
                                     include_na=True, **shared)
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")
    # Remove the first occurrence of the value found at index 2 and append the
    # RACE cell of row 21 instead (original note: "replace with NA word").
    raceList.remove(raceList[2])
    raceList.append(data[21, "RACE"])
    # Overwrite the last age with a genuine Python NaN.
    ageList[-1] = float('nan')

    for values, column, table_idx in ((raceList, "RACE", 1), (ageList, "AGE", 0)):
        compare_weightedStats(gbm_model, csv_path, values, column, tweight2,
                              pdpvWNA[table_idx], tol=1e-10)
예제 #4
0
def cv_nfolds_sd_check():
    """Check the mean/sd columns of a 4-fold GBM cross-validation metrics summary.

    Trains a bernoulli GBM with ``nfolds=4`` on the prostate data, reads the
    ``mean``, ``sd`` and the four ``cv_<k>_valid`` columns from the model's
    ``cross_validation_metrics_summary`` table and hands them to
    ``assertMeanSDCalculation``.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()
    prostate.summary()

    gbm = H2OGradientBoostingEstimator(nfolds=4, distribution="bernoulli")
    gbm.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    gbm.show()
    gbm.model_performance(xval=True)

    def summary_column(column_name):
        # Look the summary table up on every call, exactly as each extraction did before.
        return pyunit_utils.extract_col_value_H2OTwoDimTable(
            gbm._model_json["output"]["cross_validation_metrics_summary"], column_name)

    # mean and std columns calculated by the cross-validation metric
    meanCol = summary_column("mean")
    stdCol = summary_column("sd")
    # actual values from all folds
    fold_columns = ("cv_1_valid", "cv_2_valid", "cv_3_valid", "cv_4_valid")
    cvVals = [summary_column(fold) for fold in fold_columns]

    # compare manual mean/std calculation with the cross-validation calculation
    assertMeanSDCalculation(meanCol, stdCol, cvVals)
def partial_plot_test():
    """Check weighted GBM partial-dependence tables with and without NA rows on the airlines data.

    Steps performed:
      1. Import the airlines file twice (training frame and test frame).
      2. Train a GBM on IsDepDelayed, excluding the Weight and
         IsDepDelayed_REC columns from the predictors.
      3. Build partial plots for Input_miss and Distance with the weight
         column, once without and once with ``include_na=True``.
      4. Assert the last value of each NA-including column is NaN, that both
         variants agree up to the NA row, and that the NA-including tables
         match ``pyunit_utils.compare_weightedStats``.
    """
    airlines_csv = "smalldata/airlines/AirlinesTrainWgt.csv"
    y = "IsDepDelayed"
    WC = "Weight"

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(airlines_csv), na_strings=["NA"])
    test = h2o.import_file(pyunit_utils.locate(airlines_csv), na_strings=["NA"])

    x = data.names
    data[y] = data[y]  # self-assignment kept as in the original test
    for excluded in (y, WC, "IsDepDelayed_REC"):
        x.remove(excluded)

    # Build a GBM model predicting for response IsDepDelayed
    gbm_model = H2OGradientBoostingEstimator(ntrees=80, learn_rate=0.1, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp with weight and no NA
    pdpw = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True,
                                  plot=False, weight_column=WC)
    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True,
                                    plot=False, weight_column=WC, include_na=True)

    nan_message = "Expected last element to be nan but is not."
    input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[0], "input_miss")
    assert math.isnan(input_miss_list[-1]), nan_message
    distance_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[1], "distance")
    assert math.isnan(distance_list[-1]), nan_message

    # compare pdpw with pdpwNA, they should equal upto NA since the pdpw does not have NAs.
    for table_idx in (0, 1):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpw[table_idx], pdpwNA[table_idx],
                                                      pdpw[table_idx].col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results; the weights are re-read for each column
    for table_idx, values, column in ((0, input_miss_list, "Input_miss"),
                                      (1, distance_list, "Distance")):
        pyunit_utils.compare_weightedStats(gbm_model, test, values, column,
                                           test[WC].as_data_frame(use_pandas=False, header=False),
                                           pdpwNA[table_idx], tol=1e-10)
def test_anova_table_frame():
    """Check that the ANOVA GLM result frame agrees with the model summary table.

    Trains an ``H2OANOVAGLMEstimator`` on the Moore data set, then walks every
    column of the frame returned by ``model.result()`` and compares each cell
    with the same-named column of the ``model_summary`` table: numeric columns
    must agree within 1e-6, all other columns must be equal.

    :raises AssertionError: when any cell differs between the two tables.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/anovaGlm/Moore.csv"))
    y = 'conformity'
    x = ['fcategory', 'partner.status']

    model = H2OANOVAGLMEstimator(family='gaussian', lambda_=0, save_transformed_framekeys=True)
    model.train(x=x, y=y, training_frame=train)
    anova_table = model.result()

    # compare model summary and anova table frame
    model_summary = model._model_json["output"]["model_summary"]
    row_count = anova_table.nrow
    for name in anova_table.names:
        summaryCol = pyunit_utils.extract_col_value_H2OTwoDimTable(model_summary, name)
        # The column slice and its numeric flag do not change from row to row,
        # so they are evaluated once per column instead of once per cell.
        column = anova_table[name]
        column_is_numeric = column.isnumeric()[0]
        for ind in range(row_count):
            expected = summaryCol[ind]
            actual = column[ind, 0]  # read the cell once; reused in the failure message
            if column_is_numeric:
                cells_match = abs(expected - actual) < 1e-6
            else:
                cells_match = expected == actual
            assert cells_match, \
                "expected value: {0}, actual value: {1} and they are different.".format(expected, actual)
def partial_plot_test():
    """Check weighted GBM partial-dependence tables with and without NA rows on the airlines data.

    Steps performed:
      1. Import the airlines file twice (training frame and test frame).
      2. Train a GBM on IsDepDelayed, excluding the Weight and
         IsDepDelayed_REC columns from the predictors.
      3. Build partial plots for Input_miss and Distance with the weight
         column, once without and once with ``include_na=True``.
      4. Assert the last value of each NA-including column is NaN, that both
         variants agree up to the NA row, and that the NA-including tables
         match ``pyunit_utils.compare_weightedStats``.
    """
    source_csv = "smalldata/airlines/AirlinesTrainWgt.csv"
    y = "IsDepDelayed"
    WC = "Weight"

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(source_csv), na_strings=["NA"])
    test = h2o.import_file(pyunit_utils.locate(source_csv), na_strings=["NA"])

    x = data.names
    data[y] = data[y]  # self-assignment kept as in the original test
    for not_a_predictor in (y, WC, "IsDepDelayed_REC"):
        x.remove(not_a_predictor)

    # Build a GBM model predicting for response IsDepDelayed
    gbm_model = H2OGradientBoostingEstimator(ntrees=80, learn_rate=0.1, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp with weight and no NA
    pdpw = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True,
                                  plot=False, weight_column=WC)
    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True,
                                    plot=False, weight_column=WC, include_na=True)

    not_nan = "Expected last element to be nan but is not."
    input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[0], "input_miss")
    assert math.isnan(input_miss_list[-1]), not_nan
    distance_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[1], "distance")
    assert math.isnan(distance_list[-1]), not_nan

    # compare pdpw with pdpwNA, they should equal upto NA since the pdpw does not have NAs.
    for which in (0, 1):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpw[which], pdpwNA[which],
                                                      pdpw[which].col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results; the weights are re-read for each column
    checks = ((0, input_miss_list, "Input_miss"), (1, distance_list, "Distance"))
    for which, grid_values, predictor in checks:
        pyunit_utils.compare_weightedStats(gbm_model, test, grid_values, predictor,
                                           test[WC].as_data_frame(use_pandas=False, header=False),
                                           pdpwNA[which], tol=1e-10)
예제 #8
0
def partial_plot_test():
    """Check GBM partial-dependence tables with weight columns and NA rows on the prostate data.

    Steps performed:
      1. Import the prostate data set and append a constant weight column
         (all 3.0) and a varying weight column (seeded random integers 0..5).
      2. Train a GBM on CAPSULE.
      3. Assert that the table from an unweighted partial plot equals, up to
         the shorter table's rows, the one obtained with the constant weight
         column and ``include_na=True``.
      4. Compare the partial plot obtained with the varying weights against
         ``compare_weightedStats`` for both RACE and AGE.
    """
    prostate_csv = 'smalldata/prostate/prostate_cat_NA.csv'
    y = 'CAPSULE'

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(prostate_csv))
    x = data.names
    x.remove(y)

    # Constant weights first, then the seeded varying weights.
    n_rows = data.nrow
    weights = h2o.H2OFrame([3.0] * n_rows)
    random.seed(12345)
    tweight2 = [random.randint(0, 5) for _ in range(n_rows)]
    weights2 = h2o.H2OFrame(tweight2)
    data = data.cbind(weights).cbind(weights2)
    data.set_name(data.ncol - 2, "constWeight")
    data.set_name(data.ncol - 1, "variWeight")

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # Arguments common to every partial_plot call below.
    plot_args = dict(data=data, server=True, plot=True)

    # pdp without weight or NA
    pdpOrig = gbm_model.partial_plot(cols=['AGE', 'RACE'], **plot_args)
    # pdp with constant weight and NA
    pdpcWNA = gbm_model.partial_plot(cols=['AGE', 'RACE'], weight_column="constWeight",
                                     include_na=True, **plot_args)

    # compare results, one table per plotted column
    for which in (0, 1):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[which], pdpcWNA[which],
                                                      pdpOrig[which].col_header, tolerance=1e-10)

    # pdp with changing weight NA
    pdpvWNA = gbm_model.partial_plot(cols=['AGE', 'RACE'], weight_column="variWeight",
                                     include_na=True, **plot_args)
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")
    # Remove the first occurrence of the value found at index 2 and append the
    # RACE cell of row 21 instead (original note: "replace with NA word").
    raceList.remove(raceList[2])
    raceList.append(data[21, "RACE"])
    # Overwrite the last age with a genuine Python NaN.
    ageList[-1] = float('nan')

    checks = ((raceList, "RACE", 1), (ageList, "AGE", 0))
    for grid_values, predictor, which in checks:
        compare_weightedStats(gbm_model, prostate_csv, grid_values, predictor, tweight2,
                              pdpvWNA[which], tol=1e-10)