def setup_data(self):
        """
        This function performs all necessary initialization:
        load the data sets, set the training set indices, and set the response column index
        """

        # create and clean out the sandbox directory first
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)
        
        # randomly choose which family of GBM algo to use
        self.family = self.families[random.randint(0, len(self.families)-1)]

        # preload datasets, set x_indices, y_index and change response to factor for classification
        if 'multinomial' in self.family:
            self.training_metric = 'logloss'
            self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
            self.y_index = self.training1_data.ncol-1
            self.x_indices = list(range(self.y_index))
            self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
            self.scale_model = 1

        else:
            self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
            self.y_index = self.training1_data.ncol-1
            self.x_indices = list(range(self.y_index))
            self.scale_model = 0.75

        # copy the training data files into the sandbox in case the code crashes.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def fiftycatRF():
    
    

    # Training set has only 45 categories cat1 through cat45
    #Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    #train.summary()

    # Train H2O DRF Model:
    #Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    #Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv"))

    #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    #test.summary()

    # Predict on test dataset with DRF model:
    #Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    #Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
def wide_dataset_large():
  print("Reading in Arcene training data for binomial modeling.")
  trainDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
  trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
  trainDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
  xtrain = np.transpose(trainDataFeatures).tolist()
  ytrain = trainDataResponse.tolist()
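  # column 0 of the new frame holds the binary response (used as y=0 below); the remaining columns are the Arcene features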
  trainData = h2o.H2OFrame.fromPython([ytrain]+xtrain)

  trainData[0] = trainData[0].asfactor()

  print("Run model on 3250 columns of Arcene with strong rules off.")
  model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=False, alpha=1)
  model.train(x=range(1,3250), y=0, training_frame=trainData)

  print("Test model on validation set.")
  validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
  validDataResponse = np.where(validDataResponse == -1, 0, 1)
  validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
  xvalid = np.transpose(validDataFeatures).tolist()
  yvalid = validDataResponse.tolist()
  validData = h2o.H2OFrame.fromPython([yvalid]+xvalid)
  prediction = model.predict(validData)

  print("Check performance of predictions.")
  performance = model.model_performance(validData)

  print("Check that prediction AUC better than guessing (0.5).")
  assert performance.auc() > 0.5, "predictions should be better than pure chance"
def consistency_check():

    try:
        small = pyunit_utils.locate("h2o-py/demos/citi_bike_small.ipynb")
    except ValueError:
        small = pyunit_utils.locate("h2o-py/demos/citi_bike_small_NOPASS.ipynb")

    try:
        large = pyunit_utils.locate("h2o-py/demos/citi_bike_large.ipynb")
    except ValueError:
        large = pyunit_utils.locate("h2o-py/demos/citi_bike_large_NOPASS.ipynb")

    results_dir = pyunit_utils.locate("results")
    s = os.path.join(results_dir, os.path.basename(small).split('.')[0]+".py")
    l = os.path.join(results_dir, os.path.basename(large).split('.')[0]+".py")

    from tests import pydemo_utils
    pydemo_utils.ipy_notebook_exec(small, save_and_norun = s)
    pydemo_utils.ipy_notebook_exec(large, save_and_norun = l)

    small_list = list(open(s, 'r'))
    large_list = list(open(l, 'r'))

    for s, l in zip(small_list, large_list):
        if s != l:
            assert s == "data = h2o.import_file(path=small_test)\n" and \
                   l == "data = h2o.import_file(path=large_test)\n", \
                "This difference is not allowed between the small and large citibike demos.\nCitibike small: {0}\n" \
                "Citibike large: {1}".format(s, l)
def test_relevel():
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA","NA","NA","NA","NA","NA","NA","NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family = "binomial", Lambda=0, missing_values_handling = "Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    assert("DPROS.None" in ns, "None level IS NOT expected to be skipped by default")
    assert(("DPROS.Both" not in ns), "Both level IS expected to be skipped by default")
    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family = "binomial", Lambda=0, missing_values_handling = "Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    assert("DPROS.None" in ns2, "None level IS NOT expected to be skipped by default")
    assert(("DPROS.Both" not in ns2), "Both level IS expected to be skipped by default")

    #Second, compare against R input (taken from runit_relevel.R)
    dr = h2o.import_file(path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")
    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {"Intercept": -7.63245 , "DPROS.Both": 1.39185, "DPROS.Left": 0.73482, "DPROS.Right": 1.51437, "RACE.White": 0.65160, "DCAPS.Yes": 0.49233,
                 "AGE":-0.01189 , "PSA": 0.02990, "VOL": -0.01141, "GLEASON": 0.96466927}
    coeff_diff = {key: abs(exp_coefs[key] - mh2o2.coef().get(key, 0)) for key in exp_coefs.keys()}
    assert (max(coeff_diff.values()) < 1e-4)
def xgboost_milsongs_gaussian_medium():
    assert H2OXGBoostEstimator.available()

    # Import big dataset to ensure run across multiple nodes
    training_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    test_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    x = list(range(1,training_frame.ncol))
    y = 0

    # Model with maximum of 2 trees
    model_2_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.3,
                                        booster='gbtree', seed=1, ntrees=2, distribution='gaussian')
    model_2_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_2_trees = model_2_trees.predict(test_frame)

    assert prediction_2_trees.nrows == test_frame.nrows

    # Model with 10 trees
    model_10_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.3,
                                         booster='gbtree', seed=1, ntrees=10, distribution='gaussian')
    model_10_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_10_trees = model_10_trees.predict(test_frame)

    assert prediction_10_trees.nrows == test_frame.nrows

    ## Mean square error on model with lower number of decision trees should be higher
    assert model_2_trees.mse() > model_10_trees.mse()
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_
def benignKmeans():
  # Connect to a pre-existing cluster
  # connect to localhost:54321


  #  Log.info("Importing benign.csv data...\n")
  benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
  #benign_h2o.summary()

  benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
  # Impute missing values with column mean
  imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
  benign_sci = imp.fit_transform(benign_sci)

  # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))

  from h2o.estimators.kmeans import H2OKMeansEstimator

  for i in range(1,7):
    benign_h2o_km = H2OKMeansEstimator(k=i)
    benign_h2o_km.train(x = range(benign_h2o.ncol), training_frame=benign_h2o)
    print "H2O centers"
    print benign_h2o_km.centers()

    benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
    benign_sci_km.fit(benign_sci)
    print "sckit centers"
    print benign_sci_km.cluster_centers_
def iris_h2o_vs_sciKmeans():
  # Connect to a pre-existing cluster
    # connect to localhost:54321

  iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
  iris_sci = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris.csv"), delimiter=',')
  iris_sci = iris_sci[:,0:4]

  s =[[4.9,3.0,1.4,0.2],
  [5.6,2.5,3.9,1.1],
  [6.5,3.0,5.2,2.0]]

  start = h2o.H2OFrame(s)

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print("Cluster centers from H2O:")
  h2o_centers = h2o_km.centers()
  print(h2o_centers)

  # Log.info("Cluster centers from scikit:")
  print("Cluster centers from scikit:")
  sci_centers = sci_km.cluster_centers_.tolist()

  print(sci_centers)

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter,scenter):
      assert (hpoint- spoint) < 1e-10, "expected centers to be the same"
def table_check():
  df = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  print(df[['AGE','RACE']].table(dense=True).head().as_data_frame(True))
  print(df[['AGE','RACE']].table(dense=False).head().as_data_frame(True))
  print(df[['RACE','AGE']].table(dense=True).head().as_data_frame(True))
  print(df[['RACE','AGE']].table(dense=False).head().as_data_frame(True))
  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

  # single column (frame)
  table1 = iris["C5"].table()
  assert table1[0,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[0,0], table1[0,1])
  assert table1[1,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[1,0], table1[1,1])
  assert table1[2,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[2,0], table1[2,1])

  # two-column (one argument)
  
  #dense
  table2 = iris["C1"].table(iris["C5"])
  
  #not dense
  table3 = iris["C1"].table(iris["C5"],dense=False)
  
  #check same value
  assert (table3[table3['C1'] == 5,'Iris-setosa'] == table2[(table2['C1'] == 5) & (table2['C5'] == 'Iris-setosa'),'Counts']).all()
  
  assert (table2 == iris[["C1","C5"]].table()).all()
  assert (table3 == iris[["C1","C5"]].table(dense=False)).all()

  cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
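  # cross-check H2O's single-column table counts against a Python Counter built from the raw column values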
  table = cars[2].table().as_data_frame()
  table = dict(table[1:])
  table = {k:int(v) for k,v in list(table.items())}
  expected = Counter(itertools.chain(*cars[2].as_data_frame()[1:]))
  assert table == expected, "Expected {} for table counts but got {}".format(expected, table)
def offsets_and_distributions():

    # cars
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame.fromPython([[.5] for x in range(398)])
    offset.set_name(0,"x1")
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)
def col_names_check():

  iris_wheader = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
      "Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
                                                         iris_wheader.col_names)

  iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
  assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
                                                         "{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)

  df = h2o.H2OFrame.from_python(list(zip(*np.random.randn(100,4).tolist())), column_names=list("ABCD"), column_types=["enum"]*4)
  df.head()
  assert df.col_names == list("ABCD"), "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
  assert list(df.types.values()) == ["enum"]*4, "Expected {} for column types but got {}".format(["enum"]*4, df.types)

  df = h2o.H2OFrame(list(zip(*np.random.randn(100,4).tolist())))
  df.head()
  assert df.col_names == ["C1","C2","C3","C4"], "Expected {} for column names but got {}".format(["C1","C2","C3","C4"]
                                                                                                 , df.col_names)
  assert list(df.types.values()) == ["real"]*4, "Expected {} for column types but got {}".format(["real"]*4, df.types)

  df = h2o.H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
  df.head()
  assert df.col_names == ["B"], "Expected {} for column names but got {}".format(["B"], df.col_names)

  df = h2o.H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']}, column_names=["X"])
  df.head()
  assert df.col_names == ["X"], "Expected {} for column names but got {}".format(["X"], df.col_names)
def user():

    a = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    a.head()

    print(a[0].names)  # Column header
    print(a[2,0])           # column 0, row 2 value
    print(a[2,"sepal_len"]) # Column 0, row 2 value
    (a[0] + 2).show()  # Add 2 to every element; broadcast a constant
    (a[0] + a[1]).show()  # Add 2 columns; broadcast parallel add
    sum(a).show()
    print(a["sepal_len"].mean())

    print()
    print("Rows 50 through 77 in the `sepal_len` column")
    a[50:78, "sepal_len"].show()  # print out rows 50 thru 77 inclusive
    print()

    a["sepal_len"].show()

    print(a[50:78, ["sepal_len", "sepal_wid"]].show())

    a.show()

    print("The column means: ")
    print(a.mean())
    print()

    try:
        print(a["Sepal_len"].dim)  # Error, misspelt column name
    except Exception:
        pass  # Expected error

    b = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    c = a + b
    d = c + c + sum(a)
    e = c + a + 1
    e.show()
    # Note that "d=c+..." keeps the internal C expressions alive, until "d" goes
    # out of scope even as we nuke "c"
    c.show()
    c = None
    # Internal "ExprNode(c=a+b)" not dead!

    print(1 + (a[0] + b[1]).mean())

    import collections

    c = h2o.H2OFrame(collections.OrderedDict({"A": [1, 2, 3], "B": [4, 5, 6]}))
    c.show()

    c.describe()
    c.head()

    c[0].show()
    print(c[1,0])
    c[0:2,0].show()

    sliced = a[0:51,0]
    sliced.show()
def checkpoint_new_category_in_predictor():

  sv1 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
  sv2 = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
  vir = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))
  print("checkpoint_new_category_in_predictor-1")
  m1 = H2ODeepLearningEstimator(epochs=100)
  m1.train(x=[0,1,2,4], y=3, training_frame=sv1)

  m2 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
  m2.train(x=[0,1,2,4], y=3, training_frame=sv2)
  print("checkpoint_new_category_in_predictor-2")

  # attempt to continue building model, but with an expanded categorical predictor domain.
  # this should fail
  try:
    m3 = H2ODeepLearningEstimator(epochs=200, checkpoint=m1.model_id)
    m3.train(x=[0,1,2,4], y=3, training_frame=vir)
    assert False, "Expected continued model-building to fail with new categories introduced in predictor"
  except EnvironmentError:
    pass
  
  print("checkpoint_new_category_in_predictor-3")

  # attempt to predict on new model, but with observations that have expanded categorical predictor domain.
  predictions = m2.predict(vir)
  print("checkpoint_new_category_in_predictor-4")
def stackedensemble_metalearner_seed_test():

    # Import training set
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Metalearner params for GBM, DRF, GLM, and deep learning
    gbm_params = {"sample_rate" : 0.3, "col_sample_rate" : 0.3}

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    #Train two SE models with same metalearner seeds
    stack_gbm1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                            metalearner_params = gbm_params, seed = 55555)
    stack_gbm2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                            metalearner_params = gbm_params, seed = 55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = h2o.get_model(stack_gbm1.metalearner()['name'])
    meta_gbm2 = h2o.get_model(stack_gbm2.metalearner()['name'])

    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(train=True), "RMSE should match if same seed"

    #Train two SE models with diff metalearner seeds
    stack_gbm3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params = gbm_params, seed = 55555)
    stack_gbm4 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm="gbm",
                                             metalearner_params = gbm_params, seed = 98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = h2o.get_model(stack_gbm3.metalearner()['name'])
    meta_gbm4 = h2o.get_model(stack_gbm4.metalearner()['name'])
    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(train=True), "RMSE should NOT match if diff seed"
def smallcat_gbm():
  # Training set has 26 categories from A to Z
  # Categories A, C, E, G, ... are perfect predictors of y = 1
  # Categories B, D, F, H, ... are perfect predictors of y = 0

  alphabet = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"))
  alphabet["y"] = alphabet["y"].asfactor()
  #Log.info("Summary of alphabet_cattest.csv from H2O:\n")
  #alphabet.summary()

  # Prepare data for scikit use
  trainData = np.loadtxt(pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"), delimiter=',', skiprows=1, converters={0:lambda s: ord(s.decode().split("\"")[1])})
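  # the converter maps each quoted letter ('A'..'Z') in column 0 to its ordinal so scikit-learn gets a numeric feature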
  trainDataResponse = trainData[:,1]
  trainDataFeatures = trainData[:,0]

  # Train H2O GBM Model:

  gbm_h2o = H2OGradientBoostingEstimator(distribution="bernoulli",
                                         ntrees=1,
                                         max_depth=1,
                                         nbins=100)
  gbm_h2o.train(x="X",y="y", training_frame=alphabet)
  gbm_h2o.show()

  # Train scikit GBM Model:
  # Log.info("scikit GBM with same parameters:")
  gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
  gbm_sci.fit(trainDataFeatures[:,np.newaxis],trainDataResponse)
def glrm_catagorical_bug_fix():
    trainData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    testData = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    glrmModel = H2OGeneralizedLowRankEstimator(k=4)
    glrmModel.train(x=trainData.names, training_frame=trainData)
    predV = glrmModel.predict(testData)
    print(predV)
def dim_checks():
  
  

  # Log.info("Uploading logreg/princeton/cuse.dat")
  h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  np_data = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

  h2o_rows, h2o_cols = h2o_data.dim
  np_rows, np_cols = list(np_data.shape)

  print('The dimensions of h2o frame is: {0} x {1}'.format(h2o_rows, h2o_cols))
  print('The dimensions of numpy array is: {0} x {1}'.format(np_rows, np_cols))

  assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal number of columns and rows"

  # Log.info("Slice out a column and data frame it, try dim on it...")

  h2o_slice = h2o_data[4]
  np_slice = np_data[:,4]

  h2o_rows, h2o_cols = h2o_slice.dim
  np_rows = np_slice.shape[0]

  print('The dimensions of h2o column slice is: {0} x {1}'.format(h2o_rows, h2o_cols))
  print('The dimensions of numpy array column slice is: {0} x 1'.format(np_rows))

  assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal number of columns and rows"

  # Log.info("OK, now try an operator, e.g. '&', and then check dimensions agao...")

  h2oColAmpFive = h2o_slice & 5

  assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"
def link_functions_gaussian():
  print("Read in prostate data.")
  h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
  h2o_data.head()

  sm_data = pd.read_csv(zipfile.ZipFile(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")).
                        open("prostate_complete.csv")).as_matrix()
  sm_data_response = sm_data[:,9]
  sm_data_features = sm_data[:,1:9]

  print("Testing for family: GAUSSIAN")
  print("Set variables for h2o.")
  myY = "GLEASON"
  myX = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]

  print("Create models with canonical link: IDENTITY")
  h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity",alpha=0.5, Lambda=0)
  h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
  sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                    family=sm.families.Gaussian(sm.families.links.identity)).fit()

  print("Compare model deviances for link function identity")
  h2o_deviance = old_div(h2o_model.residual_deviance(), h2o_model.null_deviance())
  sm_deviance = old_div(sm_model.deviance, sm_model.null_deviance)
  assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def xgboost_insurance_gaussian_small():
    assert H2OXGBoostEstimator.available()

    # Import big dataset to ensure run across multiple nodes
    training_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_train1.csv"))
    test_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_validation1.csv"))
    x = ['Age', 'District']
    y = 'Claims'

    # Model with maximum of 2 trees
    model_2_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.7,
                                        booster='gbtree', seed=1, ntrees=2, distribution='gaussian')
    model_2_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_2_trees = model_2_trees.predict(test_frame)

    assert prediction_2_trees.nrows == test_frame.nrows

    # Model with 10 trees
    model_10_trees = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.7,
                                         booster='gbtree', seed=1, ntrees=10, distribution='gaussian')
    model_10_trees.train(x=x, y=y, training_frame=training_frame)
    prediction_10_trees = model_10_trees.predict(test_frame)

    assert prediction_10_trees.nrows == test_frame.nrows

    ## Mean square error on model with lower number of decision trees should be higher
    assert model_2_trees.mse() > model_10_trees.mse()
def plot_test():
    
    
    kwargs = {}
    kwargs['server'] = True

    air = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX], validation_y=air_valid[myY],
                      distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    #Plot ROC for test set
    perf.plot(type="roc", **kwargs)
def anomaly():
  print("Deep Learning Anomaly Detection MNIST")

  train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
  test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

  predictors = list(range(0,784))
  resp = 784

  # unsupervised -> drop the response column (digit: 0-9)
  train = train[predictors]
  test = test[predictors]

  # 1) LEARN WHAT'S NORMAL
  # train unsupervised Deep Learning autoencoder model on train_hex

  ae_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=[2], l1=1e-5, ignore_const_cols=False, epochs=1)
  ae_model.train(x=predictors,training_frame=train)

  # 2) DETECT OUTLIERS
  # anomaly app computes the per-row reconstruction error for the test data set
  # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
  test_rec_error = ae_model.anomaly(test)

  # 3) VISUALIZE OUTLIERS
  # Let's look at the test set points with low/median/high reconstruction errors.
  # We will now visualize the original test set points and their reconstructions obtained
  # by propagating them through the narrow neural net.

  # Convert the test data into its autoencoded representation (pass through narrow neural net)
  test_recon = ae_model.predict(test)
def fiftycatGBM():
  
  

  # Training set has only 45 categories cat1 through cat45
  #Log.info("Importing 50_cattest_train.csv data...\n")
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv"))
  train["y"] = train["y"].asfactor()

  #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
  #train.summary()
  
  # Train H2O GBM Model:
  #Log.info(paste("H2O GBM with parameters:\nntrees = 10, max_depth = 20, nbins = 20\n", sep = ""))
  model = h2o.gbm(x=train[["x1","x2"]], y=train["y"], distribution="bernoulli", ntrees=10, max_depth=5, nbins=20)
  model.show()
 
  # Test dataset has all 50 categories cat1 through cat50
  #Log.info("Importing 50_cattest_test.csv data...\n")
  test = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv"))
  #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
  #test.summary()
  
  # Predict on test dataset with GBM model:
  #Log.info("Performing predictions on test dataset...\n")
  predictions = model.predict(test)
  predictions.show()
  
  # Get the confusion matrix and AUC
  #Log.info("Confusion matrix of predictions (max accuracy):\n")
  performance = model.model_performance(test)
  test_cm = performance.confusion_matrix()
  test_auc = performance.auc()
def shuffling_large():
  print("Reading in Arcene training data for binomial modeling.")
  train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
  train_data_shuffled = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))


  print("Create model on original Arcene dataset.")
  h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
  h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data)

  print("Create second model on original Arcene dataset.")
  h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
  h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data)

  print("Create model on shuffled Arcene dataset.")
  h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
  h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled)

  print("Assert that number of predictors remaining and their respective coefficients are equal.")

  for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_2.
          _model_json['output']['coefficients_table'].cell_values):
    assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
    if isinstance(x[1],float):
      assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
    if isinstance(x[2],float):
      assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

  for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_s.
          _model_json['output']['coefficients_table'].cell_values):
    assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
    if isinstance(x[1],float):
      assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
    if isinstance(x[2],float):
      assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def minmax_basic():
    print "Uploading iris data..."
    iris_h2o = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"), delimiter=",", skip_header=1)

    print "Computing min & max of the first column of iris..."
    iris1_min = iris_h2o[0].min()
    print "Minimum: {0}".format(iris1_min)
    iris1_max = iris_h2o[0].max()
    print "Maximum: {0}".format(iris1_max)
    np_min = iris_np[:,0].min()
    np_max = iris_np[:,0].max()
    assert iris1_min == np_min, "Expected the same min value. H2O got {0}, but numpy got {1}".format(iris1_min, np_min)
    assert iris1_max == np_max, "Expected the same max value. H2O got {0}, but numpy got {1}".format(iris1_max, np_max)

    print "Computing min & max of all numeric columns of iris..."
    irisall_min = iris_h2o[0:4].min()
    print "Minimum: {0}".format(irisall_min)
    irisall_max = iris_h2o[0:4].max()
    print "Maximum: {0}".format(irisall_max)
    np_min = iris_np[:,0:4].min()
    np_max = iris_np[:,0:4].max()
    assert irisall_min == np_min, "Expected the same min value. H2O got {0}, but numpy got {1}".format(irisall_min, np_min)
    assert irisall_max == np_max, "Expected the same max value. H2O got {0}, but numpy got {1}".format(irisall_max, np_max)

    print "min and max correctness"
    data = [1,-0.1,0]
    mn = min(data)
    mx = max(data)
    h2o_min = h2o.H2OFrame(data).min()
    h2o_max = h2o.H2OFrame(data).max()
    assert h2o_min == mn, "Expected the same min value. H2O got {0}, but python got {1}".format(h2o_min, mn)
    assert h2o_max == mx, "Expected the same max value. H2O got {0}, but python got {1}".format(h2o_max, mx)
def export_file():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
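    # p_sid holds uniform [0,1) values; rows with p_sid > 0.2 (~80%) go to train, the rest to test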
    pros_train = pros_hex[p_sid > 0.2, :]
    pros_test = pros_hex[p_sid <= 0.2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    myglm = glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return "".join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"

    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    h2o.export_file(mypred, dname)

    py_pred = pd.read_csv(dname)
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    # Test to check if py_pred & h_pred are identical
    assert_frame_equal(py_pred, h_pred)
def wide_dataset_large():



    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial", lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
def bigcat_gbm():
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtype[54] = covtype[54].asfactor()
    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtype[54].asfactor()

    regular = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    regular.train(x=list(range(54)), y=54, training_frame=covtype)

    # do prediction on original dataset, no warnings
    check_warnings(regular, 0, covtypeTest)
    # drop response, no warnings
    covtypeTest = covtypeTest.drop(54)
    check_warnings(regular, 0, covtypeTest)

    covtypeTest = covtypeTest.drop(1)
    covtypeTest=covtypeTest.drop(1)
    check_warnings(regular, 2, covtypeTest)

    covtypeTest = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    covtypeTest[54] = covtype[54].asfactor()
    covtypeTest=covtypeTest.drop(3)
    covtypeTest=covtypeTest.drop(5)
    covtypeTest=covtypeTest.drop(7)
    check_warnings(regular, 3, covtypeTest)
def import_multi():
    airlines = h2o.import_file(path=[
        pyunit_utils.locate("smalldata/testng/airlines_train.csv"),
        pyunit_utils.locate("smalldata/testng/airlines_test.csv")
    ])

    assert airlines.nrows == 24421 + 2691
def link_functions_binomial():
  
  

  print("Read in prostate data.")
  h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
  h2o_data.head()

  sm_data = pd.read_csv(zipfile.ZipFile(pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")).open("prostate_complete.csv")).as_matrix()
  sm_data_response = sm_data[:,2]
  sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

  print("Testing for family: BINOMIAL")
  print("Set variables for h2o.")
  myY = "CAPSULE"
  myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

  print("Create models with canonical link: LOGIT")
  h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
  sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

  print("Compare model deviances for link function logit")
  h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
  sm_deviance = sm_model.deviance / sm_model.null_deviance
  assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def glm_alpha_arrays_null_lambda_cv():
    print(
        "Testing glm cross-validation with alpha array, default lambda values for binomial models."
    )
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    enum_columns = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"
    ]
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    myX = [name for name in h2o_data.names if name != myY]
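    # 80/20 train/test split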
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build model with CV but no validation dataset
    cv_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3)
    cv_model_valid.train(training_frame=training_data,
                         validation_frame=test_data,
                         x=myX,
                         y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for l in range(0, len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l],
                                           cv_r_valid['coefficients'][l],
                                           tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l],
                                           cv_r_valid['coefficients_std'][l],
                                           tol=1e-6)
def hexdev_394():
    path = pyunit_utils.locate("smalldata/covtype/covtype.20k.data")
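    # parse columns 10-12 as categorical (enum); the remaining column types are inferred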
    c_types = [None] * 55
    c_types[10] = "enum"
    c_types[11] = "enum"
    c_types[12] = "enum"
    train = h2o.import_file(path, col_types=c_types)

    cols = train.col_names  # This returned space for first column name
    x_cols = [colname for colname in cols if colname != "C55"]

    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain[54] = newtrain[54].asfactor()
    newvalid[54] = newvalid[54].asfactor()

    my_gbm = H2OGradientBoostingEstimator(distribution="multinomial",
                                          ntrees=100,
                                          learn_rate=0.1,
                                          max_depth=6)
    my_gbm.train(x=x_cols,
                 y=54,
                 training_frame=newtrain,
                 validation_frame=newvalid)

    split1, split2 = train.split_frame()
    split1[54] = split1[54].asfactor()
    split2[54] = split2[54].asfactor()

    my_gbm = H2OGradientBoostingEstimator(distribution="multinomial",
                                          ntrees=100,
                                          learn_rate=0.1,
                                          max_depth=6)
    my_gbm.train(x=x_cols,
                 y=54,
                 training_frame=split1,
                 validation_frame=split2)
def test_interaction_constraints():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate.describe()
    prostate[1] = prostate[1].asfactor()

    constraints = [["AGE", "PSA"], ["GLEASON"]]
    ntrees = 5
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", 
                                                ntrees=ntrees, 
                                                interaction_constraints=constraints,
                                                seed=42)
    prostate_gbm.train(x=list(range(2, 9)), y=1, training_frame=prostate)

    prostate_gbm.predict(prostate)
    
    importance = prostate_gbm.varimp(use_pandas=True)
    print(importance)
    
    # variables RACE, DPROS, DCAPS, VOL should have zero importance
    assert importance["variable"][3] == "RACE"
    assert importance["relative_importance"][3] == 0
    assert importance["variable"][4] == "DPROS"
    assert importance["relative_importance"][4] == 0
    assert importance["variable"][5] == "DCAPS"
    assert importance["relative_importance"][5] == 0
    assert importance["variable"][6] == "VOL"
    assert importance["relative_importance"][6] == 0
    
    # check trees features
    for i in range(ntrees):
        tree = H2OTree(model=prostate_gbm, tree_number=i)
        tree_features = set(filter(None, tree.features))
        print("iteration: "+str(i))
        print(set(constraints[0]))
        print(set(constraints[1]))
        print(tree_features)
        
        assert tree_features.issubset(set(constraints[0])) or tree_features.issubset(set(constraints[1]))
def test_gridsearch():
    h2o_data = h2o.import_file(path = pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    h2o_data['response'] = h2o_data['response'].asfactor()
    h2o_data['C3'] = h2o_data['C3'].asfactor()
    h2o_data['C7'] = h2o_data['C7'].asfactor()
    h2o_data['C8'] = h2o_data['C8'].asfactor()
    h2o_data['C10'] = h2o_data['C10'].asfactor()
    names = h2o_data.names
    myY = "response"
    myX = [name for name in names if name != myY]
    search_criteria = {'strategy': 'Cartesian'}
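    # each subspace groups hyper-parameters that must be specified together, so every gam_columns entry has matching scale, num_knots, and bs values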
    hyper_parameters = {'lambda': [1, 2],
                        'subspaces': [{'scale': [[0.001], [0.0002]], 'num_knots': [[5], [10]], 'bs':[[1], [0]], 
                                       'gam_columns': [[["c_0"]], [["c_1"]]]},
                                      {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]], 
                                       'bs':[[1, 1, 1], [0, 1, 1]], 
                                       'num_knots': [[5, 10, 12], [6, 11, 13]], 
                                       'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                                                   [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]}]}
    hyper_parameters2 = {'lambda': [1, 2],
                        'subspaces': [{'scale': [[0.001], [0.0002]], 'num_knots': [[5], [10]], 'bs':[[1], [0]],
                                       'gam_columns': [[["c_0"]], [["c_1"]]]},
                                      {'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
                                       'bs':[[1, 1, 1], [0, 1, 1]],
                                       'num_knots': [[5, 10, 12], [6, 11, 13]],
                                       'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                                                       ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]}]}
    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
                              hyper_params=hyper_parameters, search_criteria=search_criteria)
    h2o_model.train(x = myX, y = myY, training_frame = h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True, seed=1),
                              hyper_params=hyper_parameters2, search_criteria=search_criteria)
    h2o_model2.train(x = myX, y = myY, training_frame = h2o_data)
    # compare two models by checking their coefficients.  They should be the same
    for index in range(0, len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(), model2.coef(), tol=1e-6)
def cv_nfolds_gbm():
    loan_data = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/lending-club/loan.csv"))
    loan_data["bad_loan"] = loan_data["bad_loan"].asfactor()

    try:
        # parallel main model building cannot be used when we use best CV iterations right now
        set_best_cv(False)
        model_default = H2OGradientBoostingEstimator(nfolds=5,
                                                     distribution="bernoulli",
                                                     ntrees=500,
                                                     score_tree_interval=3,
                                                     stopping_rounds=2,
                                                     seed=42)
        try:
            set_parallel(True)
            model_default.train(y="bad_loan", training_frame=loan_data)
        finally:
            set_parallel(False)
        preds_default = model_default.predict(loan_data)

        model_sequential = H2OGradientBoostingEstimator(
            nfolds=5,
            distribution="bernoulli",
            ntrees=500,
            score_tree_interval=3,
            stopping_rounds=2,
            seed=42)
        model_sequential.train(y="bad_loan", training_frame=loan_data)
        preds_sequential = model_sequential.predict(loan_data)

        assert model_default.actual_params[
            "ntrees"] == model_sequential.actual_params["ntrees"]
        pyunit_utils.compare_frames_local(preds_default,
                                          preds_sequential,
                                          prob=1.0)
    finally:
        set_best_cv(True)
def h2ogrid_checkpoints():
    """
    Python API test: H2OGridSearch with export_checkpoints_dir

    Copy from pyunit_gbm_random_grid.py
    """
    air_hex = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"), destination_frame="air.hex")
    myX = ["DayofMonth", "DayOfWeek"]

    hyper_parameters = {
        'ntrees': [5, 10]
    }

    search_crit = {'strategy': "RandomDiscrete",
                   'max_models': 5,
                   'seed': 1234,
                   'stopping_rounds' : 2,
                   'stopping_metric' : "AUTO",
                   'stopping_tolerance': 1e-2
                   }
    checkpoints_dir = tempfile.mkdtemp()
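    # the grid exports a checkpoint for the grid itself plus each trained model and its CV models into this directory (counted below)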

    air_grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, search_criteria=search_crit)
    air_grid.train(x=myX, y="IsDepDelayed", training_frame=air_hex, distribution="bernoulli",
                   learn_rate=0.1,
                   max_depth=3,
                   nfolds=3,
                   export_checkpoints_dir=checkpoints_dir)

    checkpoint_files = listdir(checkpoints_dir) 
    print(checkpoint_files)
    num_files = len(checkpoint_files)
    shutil.rmtree(checkpoints_dir)

    assert_is_type(air_grid, H2OGridSearch)
    assert num_files == 1 + (2 * (1 + 3)), "Unexpected number of checkpoint files"  # 1 grid + (1 main model + 3 CV models) for each of the 2 models built
    assert all(model in checkpoint_files for model in air_grid.get_grid().model_ids), \
        "Some models do not have corresponding checkpoints"
def iris_dl_grid():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run DL

    hidden_opts = [[20, 20], [50, 50, 50]]
    loss_opts = ["Quadratic", "CrossEntropy"]
    size_of_hyper_space = len(hidden_opts) * len(loss_opts)
    hyper_parameters = {"hidden": hidden_opts, "loss": loss_opts}
    print "DL grid with the following hyper_parameters:", hyper_parameters

    gs = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters)
    gs.train(x=range(4), y=4, training_frame=train)
    print(gs.sort_by("mse"))

    assert len(gs) == size_of_hyper_space
    total_grid_space = list(map(list, itertools.product(*hyper_parameters.values())))
    for model in gs.models:
        combo = [model.parms['loss']['actual_value']] + [model.parms['hidden']['actual_value']]
        assert combo in total_grid_space
        total_grid_space.remove(combo)
def pubdev_random_cv():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    gbm1 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=3,
                   distribution=distribution,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=3,
                   distribution=distribution,
                   fold_assignment="Random")

    mse1 = gbm1.mse(xval=True)
    mse2 = gbm2.mse(xval=True)
    assert mse1 != mse2, "The first model has an MSE of {0} and the second model has an MSE of {1}. Expected the " \
                         "first to be different from the second.".format(mse1, mse2)
def h2oimport_file():
    """
    Python API test: h2o.import_file(path=None, destination_frame=None, parse=True, header=0, sep=None,
    col_names=None, col_types=None, na_strings=None)
    """
    try:
        col_types=['enum','numeric','enum','enum','enum','numeric','numeric','numeric']
        col_headers = ["CAPSULE","AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON"]
        hex_key = "training_data.hex"
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                                        destination_frame=hex_key, header=1, sep = ',',
                                        col_names=col_headers, col_types=col_types, na_strings=["NA"])
        assert_is_type(training_data, H2OFrame)
        assert training_data.frame_id == hex_key, "frame_id was not assigned correctly.  h2o.import_file() is not" \
                                                  " working."
        assert len(set(training_data.col_names) & set(col_headers))==len(col_headers), "column names are incorrect.  " \
                                                                                       "h2o.import_file() not working."
        assert training_data.nrow==380, "number of rows is incorrect.  h2o.import_file() is not working."
        assert training_data.ncol==8, "number of columns is incorrect.  h2o.import_file() is not working."
        assert sum(training_data.nacnt())==3, "NA count is incorrect.  h2o.import_file() is not working."

    except Exception as e:
        assert False, "h2o.import_file() command is not working."
def test_saved_binary_model_produces_same_predictions_as_original():
    ds = prepare_data(blending)
    base_models = train_base_models(ds)
    se_model = train_stacked_ensemble(ds, base_models)

    #Predict in ensemble in Py client
    preds_py = se_model.predict(ds.test)

    tmp_dir = tempfile.mkdtemp()
    try:
        bin_file = h2o.save_model(se_model, tmp_dir)
        #Load binary model and predict
        bin_model = h2o.load_model(pu.locate(bin_file))
        preds_bin = bin_model.predict(ds.test)
    finally:
        shutil.rmtree(tmp_dir)

    #Predictions from model in Py and binary model should be the same
    pred_diff = preds_bin - preds_py
    assert pred_diff["p0"].max() < 1e-11
    assert pred_diff["p1"].max() < 1e-11
    assert pred_diff["p0"].min() > -1e-11
    assert pred_diff["p1"].min() > -1e-11
def cv_nfolds_gbm():
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()
    prostate.summary()

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(nfolds=5,
                                                distribution="bernoulli")
    prostate_gbm.train(x=range(2, 9), y=1, training_frame=prostate)
    prostate_gbm.show()

    # Can specify both nfolds >= 2 and validation data at once
    try:
        H2OGradientBoostingEstimator(nfolds=5, distribution="bernoulli").train(
            x=range(2, 9),
            y=1,
            training_frame=prostate,
            validation_frame=prostate)

        assert True
    except EnvironmentError:
        assert False, "expected an error"
def pubdev_5167():
    training_data = h2o.import_file(pyunit_utils.locate("bigdata/laptop/airlines_all.05p.csv"))

    if 'IsDepDelayed' in training_data.names:
        training_data['IsDepDelayed'] = training_data['IsDepDelayed'].asfactor()
    else:
        raise AttributeError("label {0} not found".format('IsDepDelayed'))

    estimator = h2o.estimators.deeplearning.H2ODeepLearningEstimator(hidden=[50, 50, 50, 50, 50],
                                                                     activation='rectifier',
                                                                     adaptive_rate=True,
                                                                     balance_classes=True,
                                                                     epochs=50,
                                                                     shuffle_training_data=True,
                                                                     score_each_iteration=True,
                                                                     stopping_metric='auc',
                                                                     stopping_rounds=5,
                                                                     stopping_tolerance=.01,
                                                                     use_all_factor_levels=False,
                                                                     variable_importances=False,
                                                                     export_weights_and_biases=True,
                                                                     seed=200)
    estimator.train(x=training_data.names[:-1], y=training_data.names[-1], training_frame=training_data)
Example #43
def mojo_model_glm_test():

    # GLM
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    glm = H2OGeneralizedLinearEstimator()
    glm.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines)

    original_model_filename = tempfile.mkdtemp()
    original_model_filename = glm.download_mojo(original_model_filename)

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(
        original_model_filename)
Example #44
def test_target_encoding_fit_method():
    print("Check fit method of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  fold_column=foldColumnName,
                                  blended_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor(
    )
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)
    assert encodingMap.map_keys['string'] == teColumns
    assert encodingMap.frames[0]['num_rows'] == 583
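    # Not part of the original test: a sketch of applying the fitted encoder to
    # produce encoded columns, using the k-fold holdout strategy to limit target
    # leakage (holdout_type and the "_te" column suffix are assumptions about
    # this TargetEncoder version).
    encoded = targetEncoder.transform(frame=trainingFrame, holdout_type="kfold", seed=1234)
    print(encoded.col_names)  # encoded columns are typically appended with a "_te" suffix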
Example #45
def pubdev_5174():
    x = h2o.import_file(pyunit_utils.locate('smalldata/jira/PUBDEV-5174.csv'),
                        header=1)

    tt = x['rr'].unique()
    gg = tt[:10000, 0]

    ww = x[~x['rr'].isin(gg['C1'].as_data_frame()['C1'].tolist())]

    print(x.nrow)
    print(tt.nrow)
    print(ww.nrow)

    assert x.nrow == 1000000, "Original data has 1000000 rows"
    assert tt.nrow == 499851, "Column rr has 499851 unique values"
    assert ww.nrow == 979992, "Original data reduced has 979992 rows"

    # What do we do with Tuples?

    # there are 2 instances of 'cTeYX' and 2 of 'Todxf'
    tup = ('cTeYX', 'Todxf')
    ww_tuple = x[~x['rr'].isin(tup)]
    assert ww_tuple.nrow == 999996, "Original data reduced has 999996 rows"
def test_infogram_personal_loan_cv_valid():
    """
    Make sure safe infogram plot works with cv and validation dataset.
    """
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    target = "Personal Loan"
    fr[target] = fr[target].asfactor()
    x = ["Experience","Income","Family","CCAvg","Education","Mortgage",
         "Securities Account","CD Account","Online","CreditCard"]
    splits = fr.split_frame(ratios=[0.80])
    train = splits[0]
    test = splits[1]
    infogram_model_cv_v = H2OInfogram(seed = 12345, protected_columns=["Age","ZIP Code"], nfolds=5) 
    infogram_model_cv_v.train(x=x, y=target, training_frame=train, validation_frame=test) # cross-validation, validation
    infogram_model_cv_v.plot(title="Infogram calculated from training dataset", server=True) # plot infogram from training dataset
    infogram_model_cv_v.plot(train=True, valid=True, title="Infogram calculated from training/validation dataset",
                             server=True) # plot infogram from training and validation datasets
    infogram_model_cv_v.plot(train=True, valid=True, xval=True, title="Infogram calculated from "
                                                                      "training/validation/xval holdout dataset",
                             server=True) # plot infogram from training, validation and cv holdout datasets
    relcmi_train = infogram_model_cv_v.get_admissible_score_frame()
    relcmi_valid = infogram_model_cv_v.get_admissible_score_frame(valid=True)
    assert relcmi_train.nrow==relcmi_valid.nrow
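    # Not part of the original test: a possible follow-up, assuming this H2O
    # version exposes get_admissible_features() on H2OInfogram, to list the
    # admissible (safe and predictive) features alongside the score frame.
    print(infogram_model_cv_v.get_admissible_features())
    print(relcmi_train)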
Example #47
    def setup_data(self):
        """
        This function performs all initializations necessary:
        load the data sets and set the training set indices and response column index
        """

        # create and clean out the sandbox directory first
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(
            self.current_dir, self.test_name, True)

        # preload datasets
        self.training1_data = h2o.import_file(
            path=pyunit_utils.locate(self.training1_filename))

        # set data set indices for predictors and response
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))

        # save the training data files just in case the code crashed.
        pyunit_utils.remove_csv_files(self.current_dir,
                                      ".csv",
                                      action='copy',
                                      new_dir_path=self.sandbox_dir)
Example #48
def pca_prostate():

    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate["RACE"] = prostate["RACE"].asfactor()
    prostate["DPROS"] = prostate["DPROS"].asfactor()
    prostate["DCAPS"] = prostate["DCAPS"].asfactor()
    prostate.describe()

    print(
        "PCA on columns 3 to 9 with k = 3, transform = 'NONE', pca_method = 'Power'"
    )

    fitPCA = H2OPCA(k=3, transform="NONE", pca_method="Power")
    fitPCA.train(x=list(range(2, 9)), training_frame=prostate)
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    pred.head()
Example #49
def perfect_separation_balanced():
    print("Read in synthetic balanced dataset")
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = H2OGeneralizedLinearEstimator(family="binomial",
                                          lambda_search=True,
                                          alpha=0.5,
                                          Lambda=1e-8)
    model.train(x=["x1", "x2"], y="y", training_frame=data)

    print(
        "Extract the model's coefficients and assert reasonable values (i.e. no greater than 50)"
    )
    print("Balanced dataset")
    coef = [
        c[1]
        for c in model._model_json['output']['coefficients_table'].cell_values
        if c[0] != "Intercept"
    ]
    for c in coef:
        assert c < 50, "coefficient is too large"
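    # Equivalent check (not in the original test) using the GLM coefficient
    # accessor, which returns a {name: value} dict.
    for name, value in model.coef().items():
        if name != "Intercept":
            assert value < 50, "coefficient {0} is too large: {1}".format(name, value)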
def group_by():
    '''
    This test checks that if a groupby operation is specified for frames with string columns, a warning is
    generated about the string columns being skipped.

    In addition, it checks that operations on numeric/enum columns are performed and generate the
    expected outputs.
    
    '''
    # Connect to a pre-existing cluster

    buffer = StringIO()  # redirect output
    sys.stderr = buffer
    h2o_f1 = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/jira/test_groupby_with_strings.csv"),
                             col_types=['real', 'string', 'string', 'real'])
    grouped = h2o_f1.group_by("C1")
    grouped.mean(na="all").median(na="all").max(na="all").min(na="all").sum(
        na="all")
    print(grouped.get_frame())
    print("Checking number of warning messages...")
    check_warnings(
        2, buffer)  # make sure we received two warnings, one per string column
Example #51
def glrm_arrests():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032], [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768], [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y)))
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=4,
                                              transform="DEMEAN",
                                              loss="Quadratic",
                                              gamma_x=0,
                                              gamma_y=0,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              recover_svd=True)
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()
def glrm_arrests():
    # Legacy variant of the example above, using the deprecated h2o.glrm() wrapper.
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032], [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768], [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = h2o.glrm(x=arrestsH2O,
                        k=4,
                        transform="DEMEAN",
                        loss="Quadratic",
                        gamma_x=0,
                        gamma_y=0,
                        init="User",
                        user_y=initial_y_h2o,
                        recover_svd=True)
    glrm_h2o.show()
Example #53
def bigcatRF():

    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0

    #Log.info("Importing bigcat_5000x2.csv data...\n")
    bigcat = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    bigcat["y"] = bigcat["y"].asfactor()

    #Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
    #bigcat.summary()

    # Train H2O DRF Model:
    #Log.info("H2O DRF (Naive Split) with parameters:\nclassification = TRUE, ntree = 1, depth = 1, nbins = 100, nbins_cats=10\n")
    model = h2o.random_forest(x=bigcat[["X"]],
                              y=bigcat["y"],
                              ntrees=1,
                              max_depth=1,
                              nbins=100,
                              nbins_cats=10)
    model.show()
def link_functions_tweedie_basic():
    print "Read in prostate data."
    hdf = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print "Testing for family: TWEEDIE"
    print "Set variables for h2o."
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print "Create models with canonical link: TWEEDIE"
    model_h2o_tweedie = H2OGeneralizedLinearEstimator(family="tweedie",
                                                      link="tweedie",
                                                      alpha=0.5,
                                                      Lambda=0)
    model_h2o_tweedie.train(x=x, y=y, training_frame=hdf)

    print "Compare model deviances for link function tweedie (using precomputed values from R)"
    deviance_h2o_tweedie = model_h2o_tweedie.residual_deviance(
    ) / model_h2o_tweedie.null_deviance()

    assert 0.721452 - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: " \
                                                    "{0}, r: {1}".format(deviance_h2o_tweedie, 0.721452)
def parametersKmeans():

    print "Getting data..."
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    print "Create and and duplicate..."
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']

    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    print "wss"
    wss = iris_km.withinss().sort()
    wss_again = iris_km_again.withinss().sort()
    assert wss == wss_again, "expected wss to be equal"

    print "centers"
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
    def setup_data(self):
        """
        This function performs all initializations necessary:
        load the data sets and set the training set indices and response column index
        """
        self.h2o_data = h2o.import_file(path=pyunit_utils.locate(
            "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
        self.h2o_data["C1"] = self.h2o_data["C1"].asfactor()
        self.h2o_data["C2"] = self.h2o_data["C2"].asfactor()
        self.myX = ["C1", "C2"]
        self.myY = "C21"
        for lambda_param in self.hyper_parameters['lambda']:
            for alpha_param in self.hyper_parameters['alpha']:
                self.manual_gam_models.append(
                    H2OGeneralizedAdditiveEstimator(
                        family="gaussian",
                        gam_columns=["C11", "C12", "C13"],
                        keep_gam_cols=True,
                        scale=[1, 1, 1],
                        num_knots=[5, 5, 5],
                        alpha=alpha_param,
                        lambda_=lambda_param,
                        bs=[2, 0, 2]))
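    def train_manual_models(self):
        """
        Hypothetical companion step (not part of the original snippet): train each
        manually configured GAM model on the preloaded frame.
        """
        for gam_model in self.manual_gam_models:
            gam_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)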
def vi_toy_test():

    toy_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gbm_test/toy_data_RF.csv"))
    #toy_data.summary()

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()
    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]],
                           y=toy_data[6],
                           ntrees=500,
                           max_depth=20,
                           nbins=100,
                           seed=0)

    ranking = [
        rf._model_json['output']['variable_importances'].cell_values[v][0]
        for v in range(toy_data.ncol - 1)
    ]
    print(ranking)
    assert tuple(ranking) == tuple(
        ["V3", "V2", "V6", "V5", "V1",
         "V4"]), "expected specific variable importance ranking"
Example #58
def weights_gamma():

    htable = h2o.upload_file(
        pyunit_utils.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    htable["zon"] = htable["zon"]

    hh = H2OGradientBoostingEstimator(distribution="gamma",
                                      ntrees=20,
                                      max_depth=1,
                                      min_rows=1,
                                      learn_rate=1)
    hh.train(x=range(3),
             y="medskad",
             training_frame=htable,
             weights_column="antskad")
    ph = hh.predict(htable)

    assert abs(8.804447 - hh._model_json['output']['init_f']) < 1e-6 * 8.804447
    assert abs(3751.01 - ph[0].min()) < 1e-4 * 3751.01
    assert abs(15298.87 - ph[0].max()) < 1e-4 * 15298.87
    assert abs(8121.98 - ph[0].mean()[0]) < 1e-4 * 8121.98
Example #59
def test_teColumns_parameter_as_single_element():
    print("Check fit method can accept non-array single column to encode")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = "home.dest"
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  fold_column=foldColumnName,
                                  blending_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor(
    )
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)
    assert encodingMap.map_keys['string'] == [teColumns]
    assert encodingMap.frames[0]['num_rows'] == 583
Example #60
def group_by():
    # Connect to a pre-existing cluster

    h2o_iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    na_handling = ["rm", "ignore", "all"]

    print("Running smoke test")
    # smoke test
    for na in na_handling:
        grouped = h2o_iris.group_by("class")
        grouped \
          .count(na=na) \
          .min(  na=na) \
          .max(  na=na) \
          .mean( na=na) \
          .var(  na=na) \
          .sd(   na=na) \
          .ss(   na=na) \
          .sum(  na=na)
        print(grouped.get_frame())
        print(grouped.get_frame())  # call get_frame() again to ensure that the Pasha bug fix works.
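    # Not part of the original test: the aggregated result can also be pulled
    # into pandas for further checks (as_data_frame is a standard H2OFrame method).
    agg = grouped.get_frame().as_data_frame(use_pandas=True)
    print(agg.head())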