Exemplo n.º 1
0
def fiftycatGBM(ip, port):
    """Fit a bernoulli GBM on the 50_cattest training file and score the test file.

    ``ip`` and ``port`` are passed straight to ``h2o.init``. The fitted model
    and its predictions on the test frame are shown; the confusion matrix and
    AUC are then requested from the test-set performance object.
    """
    h2o.init(ip, port)

    # Per the file's own notes, the training data only has cat1..cat45.
    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train_frame = h2o.import_frame(path=train_path)
    train_frame["y"] = train_frame["y"].asfactor()

    gbm = h2o.gbm(x=train_frame[["x1", "x2"]],
                  y=train_frame["y"],
                  distribution="bernoulli",
                  ntrees=10,
                  max_depth=5,
                  nbins=20)
    gbm.show()

    # Per the file's own notes, the test data has all of cat1..cat50.
    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test_frame = h2o.import_frame(path=test_path)

    scored = gbm.predict(test_frame)
    scored.show()

    # The metric values are fetched but not inspected further here.
    metrics = gbm.model_performance(test_frame)
    test_cm = metrics.confusion_matrix()
    test_auc = metrics.auc()
Exemplo n.º 2
0
def frame_slicing(ip, port):
    """Spot-check H2OFrame indexing forms using ``h2o.as_list`` conversions.

    ``ip`` and ``port`` are passed to ``h2o.init``. Three datasets are
    imported and shown, then each indexing form (column, cell, row/column
    slices) is converted to a nested list and compared with hard-coded values.
    """
    h2o.init(ip, port)

    iris_frame = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate_frame = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines_frame = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    for shown in (iris_frame, prostate_frame, airlines_frame):
        shown.show()

    tol = 1e-10
    bad = "incorrect values"

    # H2OFrame[int] (column slice)
    column = h2o.as_list(iris_frame[0])
    assert abs(column[8][0] - 4.4) < tol, bad

    # H2OFrame[int,int]
    cell = h2o.as_list(prostate_frame[13, 3])
    assert abs(cell[0][0] - 1) < tol, bad

    # H2OFrame[int, slice]
    row_part = h2o.as_list(airlines_frame[12, 0:3])
    assert abs(row_part[0][0] - 1987) < tol, bad
    assert abs(row_part[0][1] - 10) < tol, bad
    assert abs(row_part[0][2] - 29) < tol, bad

    # H2OFrame[slice, int]
    # NOTE(review): four rows are checked for a 5:8 slice; this only works if
    # the slice yields at least four rows -- confirm against the h2o version.
    col_part = h2o.as_list(iris_frame[5:8, 1])
    assert abs(col_part[0][0] - 3.9) < tol, bad
    assert abs(col_part[1][0] - 3.4) < tol, bad
    assert abs(col_part[2][0] - 3.4) < tol, bad
    assert abs(col_part[3][0] - 2.9) < tol, bad

    # H2OFrame[slice, slice]
    block = h2o.as_list(prostate_frame[5:8, 0:3])
    assert abs(block[0][0] - 6) < tol, bad
    assert abs(block[1][1] - 0) < tol, bad
    assert abs(block[2][2] - 61) < tol, bad
Exemplo n.º 3
0
def fiftycatRF(ip, port):
    """Fit a random forest on the 50_cattest training file and score the test file.

    ``ip`` and ``port`` are passed to ``h2o.init``. After training, the
    function predicts on the test frame, shows the performance object and
    prints its confusion matrix.
    """
    h2o.init(ip, port)

    # Per the file's own notes, the training data only has cat1..cat45.
    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train_frame = h2o.import_frame(path=train_path)
    train_frame["y"] = train_frame["y"].asfactor()

    forest = h2o.random_forest(x=train_frame[["x1", "x2"]],
                               y=train_frame["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)

    # Per the file's own notes, the test data has all of cat1..cat50.
    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test_frame = h2o.import_frame(path=test_path)

    scored = forest.predict(test_frame)
    scored.head()

    metrics = forest.model_performance(test_frame)
    metrics.show()
    confusion = metrics.confusion_matrix()
    print(confusion)
def anomaly(ip, port):
    """Train an MNIST autoencoder and compute test-set reconstruction output.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    The function imports the MNIST train/test files, keeps columns 0..783,
    trains a single-hidden-layer autoencoder, then calls ``anomaly`` and
    ``predict`` on the test frame. Nothing is asserted or returned.
    """
    h2o.init(ip, port)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # A concrete list of indices, so the selector has the same type on
    # Python 2 and Python 3 (where range() is lazy).
    predictors = list(range(0, 784))

    # unsupervised -> keep only the predictor columns (the original notes say
    # the remaining column is the digit label 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on the training frame
    ae_model = h2o.deeplearning(x=train[predictors], training_frame=train, activation="Tanh", autoencoder=True,
                                hidden=[50], l1=1e-5, ignore_const_cols=False, epochs=1)

    # 2) DETECT OUTLIERS
    # per the original notes: per-row reconstruction error for the test set
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Convert the test data into its autoencoded representation
    # (pass through the narrow neural net)
    test_recon = ae_model.predict(test)
Exemplo n.º 5
0
def frame_slicing(ip, port):
    """Spot-check H2OFrame indexing forms by comparing cells with known values.

    ``ip`` and ``port`` are passed to ``h2o.init``. Three datasets are
    imported and shown; each indexing form is then applied directly and the
    selected entries are compared with hard-coded numbers.
    """
    h2o.init(ip, port)

    iris_frame = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate_frame = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines_frame = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    for shown in (iris_frame, prostate_frame, airlines_frame):
        shown.show()

    tol = 1e-10
    bad = "incorrect values"

    # H2OFrame[int] (column slice)
    column = iris_frame[0]
    assert abs(column[8] - 4.4) < tol, bad

    # H2OFrame[int,int]
    cell = prostate_frame[13, 3]
    assert abs(cell - 1) < tol, bad

    # H2OFrame[int, slice]
    row_part = airlines_frame[12, 0:3]
    assert abs(row_part[0, 0] - 1987) < tol, bad
    assert abs(row_part[0, 1] - 10) < tol, bad
    assert abs(row_part[0, 2] - 29) < tol, bad

    # H2OFrame[slice, int]
    col_part = iris_frame[5:8, 1]
    assert abs(col_part[0] - 3.9) < tol, bad
    assert abs(col_part[1] - 3.4) < tol, bad
    assert abs(col_part[2] - 3.4) < tol, bad

    # H2OFrame[slice, slice]
    block = prostate_frame[5:8, 0:3]
    assert abs(block[0, 0] - 6) < tol, bad
    assert abs(block[1, 1] - 0) < tol, bad
    assert abs(block[2, 2] - 61) < tol, bad
Exemplo n.º 6
0
def asnumeric(ip, port):
    """Exercise ``h2o.asnumeric`` on a whole frame and on a single column.

    ``ip`` and ``port`` are passed to ``h2o.init``. In both cases the
    'cylinders' column is converted, subtracted from itself, and the frame is
    filtered to rows where the result equals 0; the row count must be unchanged.
    """
    h2o.init(ip, port)

    factor_msg = "expected the column to be a factor"
    rows_msg = "expected the same number of rows as before {0}, but got {1}"

    # --- whole-frame conversion -------------------------------------------
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    expected_rows = cars.nrow()

    cars['cylinders'] = cars['cylinders'].ascharacter()
    assert cars["cylinders"].isfactor(), factor_msg

    cars = h2o.asnumeric(cars)
    cars['cylinders'] = cars['cylinders'] - cars['cylinders']
    cars = cars[cars['cylinders'] == 0]
    assert cars.nrow() == expected_rows, rows_msg.format(expected_rows, cars.nrow())

    # --- single-column conversion -----------------------------------------
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    cars['cylinders'] = cars['cylinders'].ascharacter()
    assert cars["cylinders"].isfactor(), factor_msg

    cars['cylinders'] = h2o.asnumeric(cars['cylinders'])
    cars['cylinders'] = cars['cylinders'] - cars['cylinders']
    cars = cars[cars['cylinders'] == 0]
    assert cars.nrow() == expected_rows, rows_msg.format(expected_rows, cars.nrow())
Exemplo n.º 7
0
def hit_ratio_test(ip, port):
    """Build a multinomial GBM and show its train/validation/test hit-ratio tables.

    ``ip`` and ``port`` are passed to ``h2o.init``. The validation and test
    frames are both imported from the AirlinesTest archive.
    """
    h2o.init(ip, port)

    train_frame = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    valid_frame = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    test_frame = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    # Same predictor columns for training and validation; each selection gets
    # its own list object, as in the two separate literals it replaces.
    predictor_names = ("Origin", "Dest", "Distance", "UniqueCarrier",
                       "IsDepDelayed", "fDayofMonth", "fMonth")

    train_x = train_frame[list(predictor_names)]
    train_y = train_frame["fDayOfWeek"].asfactor()
    valid_x = valid_frame[list(predictor_names)]
    valid_y = valid_frame["fDayOfWeek"].asfactor()

    gbm_mult = h2o.gbm(x=train_x,
                       y=train_y,
                       validation_x=valid_x,
                       validation_y=valid_y,
                       distribution="multinomial")

    on_train = gbm_mult.hit_ratio_table(train=True)
    on_train.show()

    on_valid = gbm_mult.hit_ratio_table(valid=True)
    on_valid.show()

    test_metrics = gbm_mult.model_performance(test_frame)
    on_test = test_metrics.hit_ratio_table()
    on_test.show()
Exemplo n.º 8
0
def fiftycatGBM(ip, port):
    """Fit a bernoulli-loss GBM on the 50_cattest training file and score the test file.

    ``ip`` and ``port`` are passed to ``h2o.init``. This variant passes
    ``loss="bernoulli"`` to ``h2o.gbm`` and asks the performance object for
    ``confusion_matrices()``.
    """
    h2o.init(ip, port)

    # Per the file's own notes, the training data only has cat1..cat45.
    train_path = h2o.locate("smalldata/gbm_test/50_cattest_train.csv")
    train_frame = h2o.import_frame(path=train_path)
    train_frame["y"] = train_frame["y"].asfactor()

    gbm = h2o.gbm(x=train_frame[["x1", "x2"]],
                  y=train_frame["y"],
                  loss="bernoulli",
                  ntrees=10,
                  max_depth=5,
                  nbins=20)
    gbm.show()

    # Per the file's own notes, the test data has all of cat1..cat50.
    test_path = h2o.locate("smalldata/gbm_test/50_cattest_test.csv")
    test_frame = h2o.import_frame(path=test_path)

    scored = gbm.predict(test_frame)
    scored.show()

    # The metric values are fetched but not inspected further here.
    metrics = gbm.model_performance(test_frame)
    test_cm = metrics.confusion_matrices()
    test_auc = metrics.auc()
Exemplo n.º 9
0
def asnumeric(ip, port):
    """Check that ``h2o.asnumeric`` keeps the row count for frame and column input.

    ``ip`` and ``port`` are passed to ``h2o.init``. The cars dataset is
    loaded twice: once converted as a whole frame, once with only the
    'cylinders' column converted. Each time the column is subtracted from
    itself and the frame is filtered on ``== 0``.
    """
    h2o.init(ip, port)

    data_path = "smalldata/junit/cars.csv"
    col = 'cylinders'
    factor_msg = "expected the column to be a factor"
    rows_msg = "expected the same number of rows as before {0}, but got {1}"

    frame = h2o.import_frame(path=h2o.locate(data_path))
    rows = frame.nrow()

    frame[col] = frame[col].ascharacter()
    assert frame[col].isfactor(), factor_msg

    # H2OFrame case
    frame = h2o.asnumeric(frame)
    frame[col] = frame[col] - frame[col]
    frame = frame[frame[col] == 0]
    assert frame.nrow() == rows, rows_msg.format(rows, frame.nrow())

    frame = h2o.import_frame(path=h2o.locate(data_path))
    frame[col] = frame[col].ascharacter()
    assert frame[col].isfactor(), factor_msg

    # H2OVec case
    frame[col] = h2o.asnumeric(frame[col])
    frame[col] = frame[col] - frame[col]
    frame = frame[frame[col] == 0]
    assert frame.nrow() == rows, rows_msg.format(rows, frame.nrow())
Exemplo n.º 10
0
def _assert_same_predictions(first, second, label):
    """Assert that two prediction frames agree in column 0 on every row.

    Args:
        first: prediction frame from the originally trained model.
        second: prediction frame from the model fetched via ``h2o.get_model``.
        label: model-kind word inserted into the failure message.
    """
    for row in range(first.nrow()):
        p1 = first[row, 0]
        p2 = second[row, 0]
        assert p1 == p2, "expected {0} predictions to be the same for row {1}, but got {2} and {3}" \
                         "".format(label, row, p1, p2)


def get_model_test(ip,port):
    """Check that ``h2o.get_model`` returns models equivalent to the originals.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    For regression, binomial and multinomial models the fetched copy must
    report the expected ``model_category`` and give the same predictions as
    the original; for k-means only the category is checked.
    """
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Disjoint 70/30 split. The previous thresholds (< 0.70 and >= 0.30)
    # overlapped, so most test rows were also training rows.
    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    _assert_same_predictions(predictions1, predictions2, "regression")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    _assert_same_predictions(predictions1, predictions2, "binomial")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    _assert_same_predictions(predictions1, predictions2, "multinomial")
Exemplo n.º 11
0
def bernoulliGBM(ip, port):
    """Compare test-set AUC of an H2O bernoulli GBM with a scikit-learn GBM.

    ``ip`` and ``port`` are passed to ``h2o.init``. Both models are trained
    on prostate_train.csv with the same tree count, learning rate, depth and
    minimum leaf size, then scored on prostate_test.csv. The H2O AUC must be
    at least the scikit-learn AUC.
    """
    h2o.init(ip, port)

    # Shared hyper-parameters for both libraries.
    n_trees = 100
    rate = 0.1
    tree_depth = 5
    leaf_rows = 10

    # --- training data: H2O frame plus a numpy copy for scikit-learn ------
    h2o_train = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate_train.csv"))
    h2o_train["CAPSULE"] = h2o_train["CAPSULE"].asfactor()

    np_train = np.loadtxt(h2o.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
    train_y = np_train[:, 0]   # first column is used as the response
    train_x = np_train[:, 1:]  # remaining columns are used as features

    # --- fit both models ---------------------------------------------------
    gbm_h2o = h2o.gbm(x=h2o_train[1:],
                      y=h2o_train["CAPSULE"],
                      ntrees=n_trees,
                      learn_rate=rate,
                      max_depth=tree_depth,
                      min_rows=leaf_rows,
                      distribution="bernoulli")

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=rate,
                                                  n_estimators=n_trees,
                                                  max_depth=tree_depth,
                                                  min_samples_leaf=leaf_rows,
                                                  max_features=None)
    gbm_sci.fit(train_x, train_y)

    # --- test data ---------------------------------------------------------
    h2o_test = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate_test.csv"))
    h2o_test["CAPSULE"] = h2o_test["CAPSULE"].asfactor()

    np_test = np.loadtxt(h2o.locate("smalldata/logreg/prostate_test.csv"), delimiter=',', skiprows=1)
    test_y = np_test[:, 0]
    test_x = np_test[:, 1:]

    # --- score and compare -------------------------------------------------
    sci_scores = gbm_sci.predict_proba(test_x)[:, 1]
    auc_sci = roc_auc_score(test_y, sci_scores)

    auc_h2o = gbm_h2o.model_performance(h2o_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Exemplo n.º 12
0
def headers(ip,port):
    """Check that column names supplied via ``col_names`` are applied on import.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    Imports a headers-only file, passes that frame as ``col_names`` when
    importing the airlines archive, prints both name lists and asserts they
    are equal.
    """
    # Connect to h2o
    h2o.init(ip,port)

    # Named header_frame so the local does not shadow this function's name.
    header_frame = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    headers_and = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k.zip"), col_names=header_frame)

    # Parenthesised single-argument print is valid on both Python 2 and 3.
    print(header_frame.names())
    print(headers_and.names())
    assert header_frame.names() == headers_and.names(), "Expected the same column names but got {0} and {1}". \
        format(header_frame.names(), headers_and.names())
def deeplearning_autoencoder(ip, port):
    """Use autoencoder deep features as random forest inputs on MNIST.

    ``ip`` and ``port`` are passed to ``h2o.init``. The training data is
    split in two by a seeded ``runif`` column. One half trains the
    autoencoder; the other half is passed through ``deepfeatures`` and used
    to train a random forest. The forest's confusion matrix on the test
    features is then compared with a hard-coded error value.
    """
    h2o.init(ip, port)

    label_col = 784   # index used for the response column
    hidden_units = 20  # number of features (smallest hidden layer)

    mnist_train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_test = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # seeded uniform column used to split the training data in two
    splitter = mnist_train[1].runif(1234)

    # half for the autoencoder (no labels needed)
    unsup_part = mnist_train[splitter >= 0.5]
    unsup_part.describe()

    # half for the random forest
    sup_part = mnist_train[splitter < 0.5]
    sup_part.describe()

    # train autoencoder
    autoencoder = h2o.deeplearning(x=unsup_part.drop(label_col),
                                   activation="Tanh",
                                   autoencoder=True,
                                   hidden=[hidden_units],
                                   epochs=1,
                                   reproducible=True,  # slow, turn off for real problems
                                   seed=1234)

    # convert the supervised half to the lower-dimensional space
    sup_features = autoencoder.deepfeatures(sup_part, 0)
    sup_features.describe()

    assert sup_features.ncol() == hidden_units, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    forest = h2o.random_forest(x=sup_features,
                               y=sup_part[label_col].asfactor(),
                               ntrees=10,
                               seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = autoencoder.deepfeatures(mnist_test.drop(label_col), 0)
    test_features.cbind(mnist_test[label_col])

    # Confusion Matrix and assertion
    cm = forest.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    overall_error = cm.cell_values[10][10]
    assert abs(overall_error - 0.1057) < 0.001, "Error not as expected"
def pub_445_long_request_uri(ip, port):
    """Train a GBM on all MNIST pixel columns, with a validation frame.

    ``ip`` and ``port`` are passed to ``h2o.init``. Column 784 of each frame
    is renamed to "label" and used as the response; every other column is a
    predictor.
    """
    h2o.init(ip, port)

    mnist_path = "bigdata/laptop/mnist/train.csv.gz"
    mnistTrain = h2o.import_frame(path=h2o.locate(mnist_path))
    # NOTE(review): the "test" frame is imported from the same train.csv.gz
    # file as the training frame -- confirm whether that is intended.
    mnistTest = h2o.import_frame(path=h2o.locate(mnist_path))

    for frame in (mnistTrain, mnistTest):
        frame[784]._name = "label"

    train_x = mnistTrain.drop("label")
    train_y = mnistTrain["label"]
    valid_x = mnistTest.drop("label")
    valid_y = mnistTest["label"]

    mnistModel = h2o.gbm(x=train_x,
                         y=train_y,
                         validation_x=valid_x,
                         validation_y=valid_y,
                         ntrees=100,
                         max_depth=10)
Exemplo n.º 15
0
def _check_hdfs_iris_frame(frame):
    """Run the shared post-import checks on an iris frame loaded from HDFS.

    Calls ``head`` and ``tail``, prints the row count, and asserts that the
    frame has 150 rows and is an ``h2o.H2OFrame``.
    """
    frame.head()
    frame.tail()
    n = frame.nrow()
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
        n, 150)
    assert isinstance(frame, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(
        type(frame))
    print("Import worked")


def hdfs_basic(ip, port):
    """Import iris data from HDFS as a single file and as a list of files.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    The imports are only attempted when ``h2o.is_running_internal_to_h2o()``
    is true; otherwise a message is printed and the function returns.
    All output uses parenthesised single-argument ``print`` so the function
    parses on both Python 2 and Python 3.
    """
    h2o.init(ip, port)

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = h2o.is_running_internal_to_h2o()

    if not running_inside_h2o:
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_iris_dir = "/datasets/runit/iris_test_train"

    #----------------------------------------------------------------------
    # Single file cases.
    #----------------------------------------------------------------------

    print("Testing single file importHDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_frame(url)
    _check_hdfs_iris_frame(iris_h2o)

    #----------------------------------------------------------------------
    # Directory file cases.
    #----------------------------------------------------------------------

    print("Testing directory importHDFS")
    urls = [
        "hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
        "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)
    ]
    iris_dir_h2o = h2o.import_frame(urls)
    _check_hdfs_iris_frame(iris_dir_h2o)
Exemplo n.º 16
0
def frame_show(ip, port):
    """Import three sample datasets and call ``show`` on each.

    ``ip`` and ``port`` are passed to ``h2o.init``. All three imports
    complete before the first frame is shown.
    """
    h2o.init(ip, port)

    dataset_paths = (
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv.zip",
        "smalldata/airlines/allyears2k.zip",
    )
    frames = [h2o.import_frame(path=h2o.locate(p)) for p in dataset_paths]

    for frame in frames:
        frame.show()
Exemplo n.º 17
0
def bernoulliGBM(ip, port):
    """Assert that an H2O bernoulli GBM's test AUC is at least scikit-learn's.

    ``ip`` and ``port`` are passed to ``h2o.init``. Training and test data
    come from the prostate_train/prostate_test CSV files, loaded both as H2O
    frames and as numpy arrays. Both libraries get the same tree count,
    learning rate, depth and minimum rows per leaf.
    """
    h2o.init(ip, port)

    train_csv = "smalldata/logreg/prostate_train.csv"
    test_csv = "smalldata/logreg/prostate_test.csv"

    # H2O training frame with a categorical response
    prostate_train = h2o.import_frame(path=h2o.locate(train_csv))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # numpy copy of the training data for scikit-learn (header row skipped)
    train_matrix = np.loadtxt(h2o.locate(train_csv), delimiter=',', skiprows=1)
    response_train = train_matrix[:, 0]
    features_train = train_matrix[:, 1:]

    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # H2O model
    gbm_h2o = h2o.gbm(
        x=prostate_train[1:], y=prostate_train["CAPSULE"],
        ntrees=ntrees, learn_rate=learning_rate,
        max_depth=depth, min_rows=min_rows,
        distribution="bernoulli")

    # scikit-learn model with the same settings
    gbm_sci = ensemble.GradientBoostingClassifier(
        learning_rate=learning_rate, n_estimators=ntrees,
        max_depth=depth, min_samples_leaf=min_rows,
        max_features=None)
    gbm_sci.fit(features_train, response_train)

    # H2O test frame with a categorical response
    prostate_test = h2o.import_frame(path=h2o.locate(test_csv))
    prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()

    # numpy copy of the test data for scikit-learn
    test_matrix = np.loadtxt(h2o.locate(test_csv), delimiter=',', skiprows=1)
    response_test = test_matrix[:, 0]
    features_test = test_matrix[:, 1:]

    # scikit AUC from the second predict_proba column
    positive_scores = gbm_sci.predict_proba(features_test)[:, 1]
    auc_sci = roc_auc_score(response_test, positive_scores)

    # h2o AUC from the test-set performance object
    gbm_perf = gbm_h2o.model_performance(prostate_test)
    auc_h2o = gbm_perf.auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Exemplo n.º 18
0
def col_names_check(ip, port):
    """Verify ``col_names()`` for iris files with and without a header row.

    ``ip`` and ``port`` are passed to ``h2o.init``. The with-header file
    must report its five named columns; the other file must report
    C1 through C5.
    """
    h2o.init(ip, port)

    with_header = h2o.import_frame(h2o.locate("smalldata/iris/iris_wheader.csv"))
    named = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    assert with_header.col_names() == named, \
        "Expected {0} for column names but got {1}".format(named, with_header.col_names())

    without_header = h2o.import_frame(h2o.locate("smalldata/iris/iris.csv"))
    generated = ["C1","C2","C3","C4","C5"]
    assert without_header.col_names() == generated, \
        "Expected {0} for column names but got {1}".format(generated, without_header.col_names())
def deeplearning_autoencoder(ip, port):
    """Use autoencoder deep features as random forest inputs on MNIST.

    ``ip`` and ``port`` are passed to ``h2o.init``. This variant passes a
    ``y`` column to the autoencoder call, uses ``confusionMatrix`` on the
    forest, and reads the overall error as ``cm["Totals", "Error"]``.
    """
    h2o.init(ip, port)

    label_col = 784    # index used for the response column
    hidden_units = 20  # number of features (smallest hidden layer)

    mnist_train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_test = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    # seeded uniform column used to split the training data in two
    splitter = mnist_train[1].runif(1234)

    # half for the autoencoder
    unsup_part = mnist_train[splitter >= 0.5]
    unsup_part.describe()

    # half for the random forest
    sup_part = mnist_train[splitter < 0.5]
    sup_part.describe()

    # train autoencoder; the original note says y is ignored
    # (pick any non-constant column)
    autoencoder = h2o.deeplearning(x=unsup_part.drop(label_col),
                                   y=unsup_part[label_col],
                                   activation="Tanh",
                                   autoencoder=True,
                                   hidden=[hidden_units],
                                   epochs=1,
                                   reproducible=True,  # slow, turn off for real problems
                                   seed=1234)

    # convert the supervised half to the lower-dimensional space
    sup_features = autoencoder.deepfeatures(sup_part, 0)
    sup_features.describe()

    assert sup_features.ncol() == hidden_units, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    forest = h2o.random_forest(x=sup_features,
                               y=sup_part[label_col].asfactor(),
                               ntrees=10,
                               seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = autoencoder.deepfeatures(mnist_test.drop(label_col), 0)
    test_features.cbind(mnist_test[label_col])

    # Confusion Matrix and assertion
    cm = forest.confusionMatrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    overall_error = cm["Totals", "Error"]
    assert abs(overall_error - 0.1038) < 0.001, "Error not as expected"
Exemplo n.º 20
0
def pub_445_long_request_uri(ip, port):
    """Train a GBM on MNIST columns 0..783, with a validation frame.

    ``ip`` and ``port`` are passed to ``h2o.init``. Column 784 of each frame
    is renamed to "label" with ``setName`` and used as the response.
    """
    h2o.init(ip, port)

    mnist_path = "bigdata/laptop/mnist/train.csv.gz"
    mnistTrain = h2o.import_frame(path=h2o.locate(mnist_path))
    # NOTE(review): the "test" frame is imported from the same train.csv.gz
    # file as the training frame -- confirm whether that is intended.
    mnistTest = h2o.import_frame(path=h2o.locate(mnist_path))

    for frame in (mnistTrain, mnistTest):
        frame.setName(col=784, name="label")

    train_x = mnistTrain[0:784]
    train_y = mnistTrain["label"]
    valid_x = mnistTest[0:784]
    valid_y = mnistTest["label"]

    mnistModel = h2o.gbm(x=train_x,
                         y=train_y,
                         validation_x=valid_x,
                         validation_y=valid_y,
                         ntrees=100,
                         max_depth=10)
Exemplo n.º 21
0
def headers(ip, port):
    """Check that column names supplied via ``col_names`` are applied on import.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    Imports a headers-only file, passes that frame as ``col_names`` when
    importing the airlines archive, prints both name lists and asserts they
    are equal.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Named header_frame so the local does not shadow this function's name.
    header_frame = h2o.import_frame(
        h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    headers_and = h2o.import_frame(
        h2o.locate("smalldata/airlines/allyears2k.zip"), col_names=header_frame)

    # Parenthesised single-argument print is valid on both Python 2 and 3.
    print(header_frame.names())
    print(headers_and.names())
    assert header_frame.names() == headers_and.names(), "Expected the same column names but got {0} and {1}". \
        format(header_frame.names(), headers_and.names())
Exemplo n.º 22
0
def col_names_check(ip, port):
    """Compare ``col_names()`` of two iris imports against expected lists.

    ``ip`` and ``port`` are passed to ``h2o.init``. iris_wheader.csv is
    expected to report its five named columns, and iris.csv is expected to
    report C1 through C5.
    """
    h2o.init(ip, port)

    failure = "Expected {0} for column names but got {1}"

    expected_named = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    iris_named = h2o.import_frame(h2o.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_named.col_names() == expected_named, \
        failure.format(expected_named, iris_named.col_names())

    expected_default = ["C1","C2","C3","C4","C5"]
    iris_default = h2o.import_frame(h2o.locate("smalldata/iris/iris.csv"))
    assert iris_default.col_names() == expected_default, \
        failure.format(expected_default, iris_default.col_names())
Exemplo n.º 23
0
def frame_as_list(ip,port):
    """Apply the modulo operator to frames and columns and show the results.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    ``show()`` is called directly instead of being wrapped in a ``print``
    statement: the old form printed the return value of ``show()`` on
    Python 2 and parses as ``print(...).show()`` on Python 3.
    """
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    # whole frame, then a single column
    (prostate % 10).show()
    (prostate[4] % 10).show()

    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    (airlines["CRSArrTime"] % 100).show()
Exemplo n.º 24
0
def frame_show(ip, port):
    """Load the iris, prostate and airlines samples, then show each frame.

    ``ip`` and ``port`` are passed to ``h2o.init``. Every frame is imported
    before any of them is shown.
    """
    h2o.init(ip, port)

    loaded = []
    for relative_path in ("smalldata/iris/iris_wheader.csv",
                          "smalldata/prostate/prostate.csv.zip",
                          "smalldata/airlines/allyears2k.zip"):
        loaded.append(h2o.import_frame(path=h2o.locate(relative_path)))

    iris, prostate, airlines = loaded
    iris.show()
    prostate.show()
    airlines.show()
Exemplo n.º 25
0
def cupMediumGBM(ip,port):
    """Train a small bernoulli GBM on the cup98 learning set.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    The response is TARGET_B, converted to a factor. Predictors are every
    column except the response, TARGET_D, CONTROLN and the unnamed leading
    column.
    """
    # Connect to h2o
    h2o.init(ip,port)

    train = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98LRN_z.csv"))
    # Imported as before, but not used further in this function.
    test = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98VAL_z.csv"))

    train["TARGET_B"] = train["TARGET_B"].asfactor()

    # Train H2O GBM Model:
    # The unnamed leading column appears in this file as both '' and 'C1',
    # so both spellings are excluded. Filtering, rather than list.remove,
    # avoids a ValueError when a name is absent and leaves the list returned
    # by names() unmodified.
    excluded = set(['', 'C1', "TARGET_D", "TARGET_B", "CONTROLN"])
    train_cols = [name for name in train.names() if name not in excluded]
    model = h2o.gbm(x=train[train_cols], y=train["TARGET_B"], distribution = "bernoulli", ntrees = 5)
Exemplo n.º 26
0
def frame_as_list(ip, port):
    """Show the result of ``%`` applied to a frame and to single columns.

    ``ip`` and ``port`` are passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    prostate_frame = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    # modulo on the whole frame, then on one column
    whole_mod = prostate_frame % 10
    whole_mod.show()
    column_mod = prostate_frame[4] % 10
    column_mod.show()

    airlines_frame = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    # modulo on a column selected by name
    arr_time_mod = airlines_frame["CRSArrTime"] % 100
    arr_time_mod.show()
Exemplo n.º 27
0
def cupMediumGBM(ip,port):
    """Train a small bernoulli GBM on the cup98 learning set.

    Args:
        ip: forwarded to ``h2o.init``.
        port: forwarded to ``h2o.init``.

    The response is TARGET_B, converted to a factor. Predictors are every
    column except the response, TARGET_D, CONTROLN and the unnamed leading
    column.
    """
    # Connect to h2o
    h2o.init(ip,port)

    train = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98LRN_z.csv"))
    # Imported as before, but not used further in this function.
    test = h2o.import_frame(path=h2o.locate("bigdata/laptop/usecases/cup98VAL_z.csv"))

    train["TARGET_B"] = train["TARGET_B"].asfactor()

    # Train H2O GBM Model:
    # The unnamed leading column appears in this file as both 'C1' and '',
    # so both spellings are excluded. Filtering, rather than list.remove,
    # avoids a ValueError when a name is absent and leaves the list returned
    # by names() unmodified.
    excluded = set(['C1', '', "TARGET_D", "TARGET_B", "CONTROLN"])
    train_cols = [name for name in train.names() if name not in excluded]
    model = h2o.gbm(x=train[train_cols], y=train["TARGET_B"], distribution = "bernoulli", ntrees = 5)
Exemplo n.º 28
0
def expr_slicing(ip, port):
    """Spot-check indexing on the results of arithmetic over an H2OFrame.

    ``ip`` and ``port`` are passed to ``h2o.init``. The iris frame is
    combined with a scalar (``2 - iris``, later ``iris * 2``) and each
    indexing form on the result is compared with hard-coded values.
    """
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    tol = 1e-10
    bad = "incorrect values"

    ###################################################################

    # H2OFrame[int] (column slice)
    two_minus = 2 - iris
    first_col = two_minus[0]
    assert abs(first_col[3,:] - (-2.6)) < tol, bad
    assert abs(first_col[17,:] - (-3.1)) < tol, bad
    assert abs(first_col[24,:] - (-2.8)) < tol, bad

    # H2OFrame[int,int]
    assert abs(two_minus[13, 3] - 1.9) < tol, bad

    # H2OFrame[int, slice]
    one_row = two_minus[12, 0:4]
    assert abs(one_row[0,0] - (-2.8)) < tol, bad
    assert abs(one_row[0,1] - (-1.0)) < tol, bad
    assert abs(one_row[0,2] - 0.6) < tol, bad
    assert abs(one_row[0,3] - 1.9) < tol, bad

    # H2OFrame[slice, int]
    some_rows = two_minus[5:9, 1]
    assert abs(some_rows[0,:] - (-1.9)) < tol, bad
    assert abs(some_rows[1,:] - (-1.4)) < tol, bad
    assert abs(some_rows[2,:] - (-1.4)) < tol, bad
    assert abs(some_rows[3,:] - (-0.9)) < tol, bad

    # H2OFrame[slice, slice]
    doubled = iris * 2
    sub_block = doubled[5:9, 0:4]
    assert abs(sub_block[0,0] - 10.8) < tol, bad
    assert abs(sub_block[1,1] - 6.8) < tol, bad
    assert abs(sub_block[2,2] - 3.0) < tol, bad
    assert abs(sub_block[3,3] - 0.4) < tol, bad
def deep_learning_metrics_test(ip, port):
    """Train a deep learning model on the prostate data and show its test metrics.

    Connects to the H2O cluster at ``ip``:``port``, removes the ``ID`` column,
    converts ``CAPSULE`` to a factor, marks zero ``VOL`` values as missing and
    splits the rows with a uniform random draw (below 0.8 -> train, otherwise
    test). A network with three hidden layers of 10 units is trained for 100
    epochs and its performance on the test split is shown.
    """
    h2o.init(ip, port)  # connect to existing cluster
    # Resolve the data file through h2o.locate, like the other tests in this file.
    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    del df['ID']  # remove ID
    df['CAPSULE'] = df['CAPSULE'].asfactor()  # make CAPSULE categorical
    vol = df['VOL']
    vol[vol == 0] = None  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    test.describe()
    test.head()

    # Run DeepLearning.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:],
                          y=train['CAPSULE'],
                          epochs=100,
                          hidden=[10, 10, 10])
    print("Binomial Model Metrics: ")
    print("")
    dl.model_performance(test).show()
Exemplo n.º 30
0
def iris_h2o_vs_sciKmeans(ip,port):
    """Compare H2O k-means centers on iris with those from scikit-learn.

    Both implementations are seeded with the same three initial centers (H2O
    with ``standardize=False``), and every coordinate of the resulting centers
    must agree to within 1e-10.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:,0:4]

    # Shared initial cluster centers
    s = [[4.9,3.0,1.4,0.2],
         [5.6,2.5,3.9,1.1],
         [6.5,3.0,5.2,2.0]]

    start = h2o.H2OFrame(s)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter,scenter):
            # abs() is required: without it any negative difference, however
            # large, would satisfy the tolerance check.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def getModelKmeans(ip, port):
    """Build k-means models on the benign data and re-fetch each one by key.

    For k = 2..6 an H2O k-means model is trained and shown, the same model is
    retrieved through ``h2o.getModel`` and shown again, and a scikit-learn
    ``KMeans`` is fitted on the mean-imputed data with its centers printed.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    benign_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        # TODO: implement h2o.getModel()
        model = h2o.getModel(km_h2o._key)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("sckit centers")
        print(km_sci.cluster_centers_)
def cv_nfoldsGBM(ip, port):
    """Exercise 5-fold cross-validated GBM on the prostate data.

    First trains and shows a bernoulli GBM with ``nfolds=5``; then checks that
    passing validation data together with ``nfolds=5`` raises
    ``EnvironmentError``.
    """
    h2o.init(ip, port)

    prostate = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/prostate.csv"))

    cv_model = h2o.gbm(y=prostate[1],
                       x=prostate[2:9],
                       nfolds=5,
                       distribution="bernoulli")
    cv_model.show()

    # Can't specify both nfolds >= 2 and validation data at once
    try:
        h2o.gbm(y=prostate[1],
                x=prostate[2:9],
                nfolds=5,
                validation_y=prostate[1],
                validation_x=prostate[2:9],
                distribution="bernoulli")
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error"
Exemplo n.º 33
0
def bigcatGBM(ip, port):
    """Train a single-stump GBM on the bigcat data and show its performance.

    The response ``y`` is converted to a factor, a GBM with one tree of depth
    one and 100 bins is fitted on column ``X``, and the model's performance on
    the training frame itself is shown and its AUC retrieved.
    """
    h2o.init(ip, port)

    data_path = h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_frame(path=data_path)
    bigcat["y"] = bigcat["y"].asfactor()

    # H2O GBM with ntrees = 1, max_depth = 1, nbins = 100
    model = h2o.gbm(x=bigcat[["X"]], y=bigcat["y"], loss="bernoulli",
                    ntrees=1, max_depth=1, nbins=100)
    model.show()

    performance = model.model_performance(bigcat)
    performance.show()

    # The AUC is fetched but not checked against any threshold here.
    test_auc = performance.auc()
Exemplo n.º 34
0
def trim_check(ip, port):
    """Verify that ``trim()`` strips whitespace from the ``name`` column.

    The first three trimmed values of ``cars_trim.csv`` are compared with the
    expected car names, once by trimming the column expression directly and
    once through an intermediate variable holding the column.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    frame = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_trim.csv"))

    expected_names = [
        "AMC Ambassador Brougham",
        "AMC Ambassador DPL",
        "AMC Ambassador SST",
    ]

    # single column (frame)
    trimmed_frame = frame["name"].trim()
    for row, expected in enumerate(expected_names):
        assert trimmed_frame[row, 0] == expected, "Expected '{0}', but got {1}".format(
            expected, trimmed_frame[row, 0]
        )

    # single column (vec)
    vec = frame["name"]
    trimmed_vec = vec.trim()
    for row, expected in enumerate(expected_names):
        # Report the value that was actually compared (trimmed_vec); the
        # previous messages showed trimmed_frame here by mistake.
        assert trimmed_vec[row, 0] == expected, "Expected '{0}', but got {1}".format(
            expected, trimmed_vec[row, 0]
        )
Exemplo n.º 35
0
def sdev(ip, port):
    """Compare H2O's ``sd()`` on the iris columns with NumPy's sample std.

    The four numeric columns must match NumPy (``ddof=1``) to within 1e-10.
    Calling ``sd()`` on the fifth column must raise ``EnvironmentError`` and
    calling it on a two-column slice must raise ``AttributeError``.
    """
    h2o.init(ip, port)

    iris_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',', skip_header=1, usecols=(0, 1, 2, 3))

    expected_sd = np.std(iris_np, axis=0, ddof=1)
    for col in range(4):
        h2o_sd = iris_h2o[col].sd()
        difference = abs(expected_sd[col] - h2o_sd.eager())
        assert difference < 1e-10, "expected standard deviations to be the same"

    # Categorical column: sd() must fail
    try:
        iris_h2o[4].sd().eager()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    # More than one column: sd() must fail
    try:
        iris_h2o[0:2].sd().eager()
    except AttributeError:
        pass
    else:
        assert False, "expected an error. more than one column."
def link_functions_gaussian(ip, port):
    """Compare H2O and statsmodels gaussian GLMs with the identity link.

    Both libraries fit the prostate data; the ratio residual/null deviance of
    the H2O model must not exceed that of the statsmodels model by 0.01 or
    more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    data_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=data_path)
    h2o_data.head()

    # Close the archive and its member deterministically, and use .values:
    # DataFrame.as_matrix() has been removed from pandas.
    with zipfile.ZipFile(data_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    # NOTE(review): column 9 / columns 1-8 are presumably the positions of
    # myY / myX below; the code does not verify that mapping.
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity", alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(
        endog=sm_data_response, exog=sm_data_features, family=sm.families.Gaussian(sm.families.links.identity)
    ).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better, but not worse by 0.01 or more.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def iris_h2o_vs_sciKmeans(ip, port):
    """Compare H2O k-means centers on iris with those from scikit-learn.

    Both implementations are seeded with the same three initial centers (H2O
    with ``standardize=False``), and every coordinate of the resulting centers
    must agree to within 1e-10.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"),
                             delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # Shared initial cluster centers
    s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]]

    start = h2o.H2OFrame(s)
    start_key = start.send_frame()

    h2o_km = h2o.kmeans(x=iris_h2o[0:4],
                        k=3,
                        user_points=start_key,
                        standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # abs() is required: without it any negative difference, however
            # large, would satisfy the tolerance check.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
Exemplo n.º 38
0
def expr_show(ip,port):
    """Call ``show()`` on expressions built from the iris frame and from local lists.

    Shows the first column of ``2 - iris`` twice (the original comments
    describe the data as pending on the first access and remote on the
    second), then shows ``Expr`` objects built from a flat list and from a
    nested list.
    """
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("iris:")
    iris.show()

    ###################################################################

    # expr[int], expr._data is pending
    res = 2 - iris
    res2 = res[0]
    print("res2:")
    res2.show()

    # expr[int], expr._data is remote
    res3 = res[0]
    print("res3:")
    res3.show()

    # expr[int], expr._data is local
    expr = Expr([1,2,3])
    print("expr:")
    expr.show()

    # expr[tuple], expr._data is local
    expr = Expr([[1,2,3], [4,5,6]])
    print("expr:")
    expr.show()
Exemplo n.º 39
0
def group_by(ip,port):
    """Smoke-test ``h2o.group_by`` on iris and compare some results with pandas.

    Every aggregate / NA-handling / column combination is run once grouped by
    ``class``. For min, max, mean and sum, the first three result rows are then
    compared with the pandas group-by value for the same class (tolerance 1e-06).
    """
    h2o.init(ip,port)

    h2o_iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    # smoke test
    for agg in h2o_agg_funcs:
        for na_mode in na_handling:
            for column in col_names:
                h2o.group_by(h2o_iris, ["class"], {"foo":[agg,column,na_mode]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for agg_name, np_func in h2o_np_agg_dict.items():
        for column in col_names:
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[agg_name,column,"all"]})
            pd_res = pd_iris.groupby("class")[column].aggregate(np_func)
            for row in range(3):
                h2o_val = h2o_res[row,1]
                # Look the pandas value up by the class label H2O reported in this row.
                pd_val = pd_res[h2o_res[row,0]]
                assert abs(h2o_val - pd_val) < 1e-06, (
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} "
                    "values between h2o and pandas on column {3}".format(h2o_val,pd_val,agg_name,column))
Exemplo n.º 40
0
def group_by(ip,port):
    """Smoke-test ``h2o.group_by`` on iris and compare some results with pandas.

    Every aggregate / NA-handling / column combination is run once grouped by
    ``class``, logging each combination. For min, max, mean and sum, the first
    three result rows are then compared with the pandas group-by value for the
    same class (tolerance 1e-06).
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    h2o_iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Running smoke test")

    # smoke test
    for a in h2o_agg_funcs:
        for n in na_handling:
            for c in col_names:
                print("group by : " + str(a) + "; " + str(n) + "; " + str(c))
                h2o.group_by(h2o_iris, ["class"], {"foo":[a,c,n]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for k in h2o_np_agg_dict:
        for c in col_names:
            print("group by comparison: " + str(k) + "; " + str(c))
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[k,c,"all"]})
            pd_res = pd_iris.groupby("class")[c].aggregate(h2o_np_agg_dict[k])
            for i in range(3):
                h2o_val = h2o_res[i,1]
                # Look the pandas value up by the class label H2O reported in this row.
                pd_val = pd_res[h2o_res[i,0]]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} " \
                    "values between h2o and pandas on column {3}".format(h2o_val,pd_val,k,c)
Exemplo n.º 41
0
def benignKmeans(ip, port):
    """Print H2O and scikit-learn k-means centers on the benign data for k = 1..6.

    The scikit-learn side runs on a NumPy copy of the file whose missing
    values are replaced by the column mean. The centers are only printed, not
    compared.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    benign_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(1, 7):
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=i)
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("sckit centers")
        print(benign_sci_km.cluster_centers_)
def covtype_get_model(ip,port):
    """Fit binomial GLMs on covtype and re-fetch each one with ``h2o.get_model``.

    The response (column 54) is replaced by an indicator for a randomly chosen
    class in 1..4. Three models are trained (alpha/lambda = 0/0, 0.5/1e-4,
    1/1e-4); each is shown, retrieved again by its ``_id`` and shown once more.
    """
    # Connect to h2o
    h2o.init(ip,port)

    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    # list(...) keeps the concatenation valid on Python 3, where range() is
    # not a list.
    X = list(range(0,20)) + list(range(29,54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1,4)
    covtype[Y] = (covtype[Y] == res_class)

    # (alpha, lambda) pairs: L2, elastic net, L1
    penalties = [(0, 0), (0.5, 1e-4), (1, 1e-4)]
    for alpha, lam in penalties:
        model = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[alpha], Lambda=[lam])
        model.show()
        model = h2o.get_model(model._id)
        model.show()
Exemplo n.º 43
0
def benign(ip, port):
    """Check that a binomial GLM on the benign data uses exactly the given columns.

    Column 3 is the response and columns 0-2 and 4-10 are the predictors. The
    names in the model's coefficients table, after its first entry, must equal
    the predictor names in order.
    """
    # Connect to h2o
    h2o.init(ip, port)

    training_data = h2o.import_frame(h2o.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # list(...) keeps the concatenation valid on Python 3, where range() is
    # not a list.
    X = list(range(3)) + list(range(4, 11))

    # Build the model
    model = h2o.glm(y=training_data[Y].asfactor(),
                    x=training_data[X],
                    family="binomial",
                    alpha=[0],
                    Lambda=[1e-5])

    # Check that the columns used in the model are the ones we passed in.
    all_names = training_data.names()
    in_names = [all_names[i] for i in X]
    coef_rows = model._model_json['output']['coefficients_table'].cell_values
    out_names = [coef_rows[c][0] for c in range(len(X) + 1)]
    # The first coefficient row is skipped (presumably the intercept — the
    # code does not check its label).
    assert in_names == out_names[1:]
Exemplo n.º 44
0
def link_functions_gamma(ip, port):
    """Compare H2O and statsmodels gamma GLMs for the inverse and log links.

    For each link both libraries fit the prostate data; the ratio
    residual/null deviance of the H2O model must not exceed that of the
    statsmodels model by 0.01 or more.
    """
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    data_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=data_path)
    h2o_data.head()

    # Close the archive and its member deterministically, and use .values:
    # DataFrame.as_matrix() has been removed from pandas.
    with zipfile.ZipFile(data_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    # NOTE(review): column 5 / the remaining listed columns are presumably the
    # positions of myY / myX below; the code does not verify that mapping.
    sm_data_response = sm_data[:, 5]
    sm_data_features = sm_data[:, [1, 2, 3, 4, 6, 7, 8, 9]]

    print("Testing for family: GAMMA")
    print("Set variables for h2o.")
    myY = "DPROS"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "CAPSULE"]

    print("Create models with canonical link: INVERSE")
    h2o_model_in = h2o.glm(x=h2o_data[myX],
                           y=h2o_data[myY],
                           family="gamma",
                           link="inverse",
                           alpha=[0.5],
                           Lambda=[0],
                           n_folds=0)
    sm_model_in = sm.GLM(endog=sm_data_response,
                         exog=sm_data_features,
                         family=sm.families.Gamma(
                             sm.families.links.inverse_power)).fit()

    print("Compare model deviances for link function inverse")
    h2o_output_in = h2o_model_in._model_json['output']
    h2o_deviance_in = h2o_output_in['residual_deviance'] / h2o_output_in['null_deviance']
    sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
    # One-sided on purpose: H2O may be better, but not worse by 0.01 or more.
    assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measures"

    print("Create models with canonical link: LOG")
    h2o_model_log = h2o.glm(x=h2o_data[myX],
                            y=h2o_data[myY],
                            family="gamma",
                            link="log",
                            alpha=[0.5],
                            Lambda=[0],
                            n_folds=0)
    sm_model_log = sm.GLM(endog=sm_data_response,
                          exog=sm_data_features,
                          family=sm.families.Gamma(
                              sm.families.links.log)).fit()

    print("Compare model deviances for link function log")
    h2o_output_log = h2o_model_log._model_json['output']
    h2o_deviance_log = h2o_output_log['residual_deviance'] / h2o_output_log['null_deviance']
    sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
    assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measures"
Exemplo n.º 45
0
def slicing_shape(ip,port):
    """Check the dimensions reported by sliced views of the prostate frame.

    Covers column slices, a random row with a column slice, a row slice with a
    random column, and row-slice / column-slice combinations.
    """
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    rows, cols = prostate.dim()

    rows_msg = "incorrect number of rows. correct: {0}, computed: {1}"
    cols_msg = "incorrect number of cols. correct: {0}, computed: {1}"

    #foo = prostate[0:0] # TODO: empty frame allowed?
    #foo.show()

    # prostate[slice]
    for width in range(1,cols+1):
        got_rows, got_cols = prostate[0:width].dim()
        assert got_rows == rows, rows_msg.format(rows, got_rows)
        assert got_cols == width, cols_msg.format(width, got_cols)

    # prostate[int,slice]
    for width in range(1,cols+1):
        row_idx = random.randint(0,rows-1)
        got_rows, got_cols = prostate[row_idx,0:width].dim()
        assert got_rows == 1, rows_msg.format(1, got_rows)
        assert got_cols == width, cols_msg.format(width, got_cols)

    # prostate[slice,int] # TODO: there's a bug here: HEXDEV-266
    for height in range(1,10):
        col_idx = random.randint(0,cols-1)
        got_rows, got_cols = prostate[0:height,col_idx].dim()
        assert got_rows == height, rows_msg.format(height, got_rows)
        assert got_cols == 1, cols_msg.format(1, got_cols)

    # prostate[slice,slice] # TODO: there's a bug here: HEXDEV-266
    for height in range(1,10):
        for width in range(1,cols+1):
            got_rows, got_cols = prostate[0:height,0:width].dim()
            assert got_rows == height, rows_msg.format(height, got_rows)
            assert got_cols == width, cols_msg.format(width, got_cols)
Exemplo n.º 46
0
def center_scale(ip,port):
    """Smoke-test ``scale()`` on the iris frame and on single columns.

    Calls ``scale()`` with default arguments and with the (center, scale)
    combinations (True, False), (False, True) and (False, False), first on the
    four-column frame and then on columns 0..3. No results are asserted.
    """
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))[0:4]

    # Explicit (center, scale) combinations, in the order they are exercised
    flag_combos = [(True, False), (False, True), (False, False)]

    # frame (default args)
    scaled = iris.scale()
    # TODO: the below assertion fails. Should it?
    #assert abs(foo[0,0] - -0.8976739) < 1e-6 and  abs(foo[0,1] - 1.01560199) < 1e-6 and abs(foo[0,2] - -1.335752) < 1e-6 \
    #       and abs(foo[0,3] - -1.311052) < 1e-6, "h2o differed from r. h2o got {0}, {1}, {2}, and {3}" \
    #                                             "".format(foo[0,0],foo[0,1],foo[0,2],foo[0,3])

    # frame, explicit arguments
    for do_center, do_scale in flag_combos:
        scaled = iris.scale(center=do_center, scale=do_scale)

    # vec (default args)
    scaled = iris[0].scale()

    # vec, explicit arguments: columns 1, 2 and 3 each get one combination
    for col, (do_center, do_scale) in enumerate(flag_combos, 1):
        scaled = iris[col].scale(center=do_center, scale=do_scale)
def deep_learning_metrics_test(ip, port):
    """Train a deep learning model on the prostate data and show its metrics.

    Connects to the H2O cluster at ``ip``:``port``, converts ``CAPSULE`` to a
    factor, marks zero ``VOL`` values as NaN and splits the rows with a uniform
    random draw (below 0.8 -> train, otherwise test). A network with three
    hidden layers of 10 units is trained for 100 epochs with cross-entropy
    loss; the model and its performance on the test split are shown.
    """
    h2o.init(ip, port)  # connect to existing cluster

    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # NOTE(review): the result of drop() is discarded. If drop() returns a new
    # frame without modifying df, ID is NOT removed and train[1:] below
    # would start at a different column than intended — confirm against the
    # H2OFrame.drop contract.
    df.drop("ID")  # remove ID
    df["CAPSULE"] = df["CAPSULE"].asfactor()  # make CAPSULE categorical
    vol = df["VOL"]
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    train.tail()
    test.describe()
    test.head()
    test.tail()

    # Run DeepLearning.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Train a Deeplearning model: ")
    dl = h2o.deeplearning(x=train[1:], y=train["CAPSULE"], epochs=100, hidden=[10, 10, 10], loss="CrossEntropy")
    print("Binomial Model Metrics: ")
    print("")
    dl.show()
    dl.model_performance(test).show()
Exemplo n.º 48
0
def offset_tweedie(ip, port):
    """Check a tweedie GBM with an offset column against reference values.

    The offset is ``log(Holders)`` of the insurance data. The model's
    ``init_f`` and the mean, min and max of its predictions on the training
    frame are compared with values the original comments attribute to
    harrysouthworth's R gbm.
    """
    h2o.init(ip, port)

    insurance = h2o.import_frame(
        h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"],
                  distribution="tweedie", ntrees=600, max_depth=1, min_rows=1,
                  learn_rate=.1, offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from harrysouthworth's gbm:
    #	fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #           data = Insurance, distribution ="tweedie", n.trees = 600)
    #	pr = predict(fit2, Insurance)
    #	pr = exp(pr+log(Insurance$Holders))
    assert abs(-1.869702 - gbm._model_json['output']['init_f']) < 1e-5, (
        "expected init_f to be {0}, but got {1}".format(
            -1.869702, gbm._model_json['output']['init_f']))
    assert abs(49.21591 - predictions.mean()) < 1e-4, (
        "expected prediction mean to be {0}, but got {1}".format(
            49.21591, predictions.mean()))
    assert abs(1.0258 - predictions.min()) < 1e-4, (
        "expected prediction min to be {0}, but got {1}".format(
            1.0258, predictions.min()))
    assert abs(392.4651 - predictions.max()) < 1e-2, (
        "expected prediction max to be {0}, but got {1}".format(
            392.4651, predictions.max()))
Exemplo n.º 49
0
def smallcatGBM(ip,port):
    """Train one-stump GBMs in H2O and scikit-learn on the alphabet data.

    Per the original notes, the training set has 26 categories A..Z, where
    A, C, E, G, ... perfectly predict y = 1 and B, D, F, H, ... perfectly
    predict y = 0. Both models are fitted but their results are not compared.
    """
    h2o.init(ip,port)

    alphabet = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    alphabet["y"] = alphabet["y"].asfactor()

    def _letter_code(field):
        # Take the text between the first pair of double quotes and map it to
        # its character code.
        return ord(field.split("\"")[1])

    # Prepare data for scikit use: column 0 -> feature, column 1 -> response
    trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                           delimiter=',',
                           skiprows=1,
                           converters={0: _letter_code})
    trainDataResponse = trainData[:,1]
    trainDataFeatures = trainData[:,0]

    # H2O GBM with ntrees = 1, max_depth = 1, nbins = 100
    gbm_h2o = h2o.gbm(x=alphabet[['X']],
                      y=alphabet["y"],
                      distribution="bernoulli",
                      ntrees=1,
                      max_depth=1,
                      nbins=100)
    gbm_h2o.show()

    # scikit GBM with the same tree count and depth
    gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    # scikit expects a 2-D feature matrix, hence the added axis
    gbm_sci.fit(trainDataFeatures[:,np.newaxis],trainDataResponse)
Exemplo n.º 50
0
def expr_show(ip, port):
    """Call ``show()`` on expressions built from the iris frame and from local lists.

    Shows the first column of ``2 - iris`` twice (the original comments
    describe the data as pending on the first access and remote on the
    second), then shows ``Expr`` objects built from a flat list and from a
    nested list.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("iris:")
    iris.show()

    ###################################################################

    # expr[int], expr._data is pending
    res = 2 - iris
    res2 = res[0]
    print("res2:")
    res2.show()

    # expr[int], expr._data is remote
    res3 = res[0]
    print("res3:")
    res3.show()

    # expr[int], expr._data is local
    expr = Expr([1, 2, 3])
    print("expr:")
    expr.show()

    # expr[tuple], expr._data is local
    expr = Expr([[1, 2, 3], [4, 5, 6]])
    print("expr:")
    expr.show()
Exemplo n.º 51
0
def table_check(ip,port):
    """Verify ``h2o.table`` counts on the iris data.

    A one-column table of ``C5`` must report 50 for each of its first three
    rows. A two-column table of ``C1`` and ``C5`` must report 4, 5 and 3 in
    its first three rows, whether the columns are passed as one frame, as two
    single-column frames or as two vecs.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    # Expected counts in the first three rows of the C1 x C5 table
    expected_counts = [4, 5, 3]

    # single column (frame)
    table1 = h2o.table(iris[["C5"]])
    for row in range(3):
        assert table1[row,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[row,0], table1[row,1])

    # single column (vec)
    table1 = h2o.table(iris["C5"])
    for row in range(3):
        assert table1[row,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[row,0], table1[row,1])

    # two-column (one argument)
    # The failure messages now state the expected count; before, they read
    # "Expected , but got ...".
    table2 = h2o.table(iris[["C1", "C5"]])
    for row, expected in enumerate(expected_counts):
        assert table2[row,2] == expected, "Expected {0}, but got {1}".format(expected, table2[row,2])

    # two columns (separate arguments (frames))
    table3 = h2o.table(iris[["C1"]],iris[["C5"]])
    for row, expected in enumerate(expected_counts):
        assert table3[row,2] == expected, "Expected {0}, but got {1}".format(expected, table3[row,2])

    # two columns (separate arguments (vecs))
    table3 = h2o.table(iris["C1"],iris["C5"])
    for row, expected in enumerate(expected_counts):
        assert table3[row,2] == expected, "Expected {0}, but got {1}".format(expected, table3[row,2])
def link_functions_binomial(ip,port):
    """Compare H2O and statsmodels binomial GLMs with the logit link.

    Both libraries fit the prostate data; the ratio residual/null deviance of
    the H2O model must not exceed that of the statsmodels model by 0.01 or
    more.
    """
    # Connect to h2o
    h2o.init(ip,port)

    print("Read in prostate data.")
    data_path = h2o.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_frame(path=data_path)
    h2o_data.head()

    # Close the archive and its member deterministically, and use .values:
    # DataFrame.as_matrix() has been removed from pandas.
    with zipfile.ZipFile(data_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    # NOTE(review): column 2 / the remaining listed columns are presumably the
    # positions of myY / myX below; the code does not verify that mapping.
    sm_data_response = sm_data[:,2]
    sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_output = h2o_model._model_json['output']
    h2o_deviance = h2o_output['residual_deviance'] / h2o_output['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better, but not worse by 0.01 or more.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
Exemplo n.º 53
0
def sdev(ip,port):
    """Compare H2O's ``sd()`` on the iris columns with NumPy's sample std.

    The four numeric columns must match NumPy (``ddof=1``) to within 1e-10.
    Calling ``sd()`` on the fifth column or on a two-column slice must raise
    ``EnvironmentError``.
    """
    h2o.init(ip,port)

    iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',', skip_header=1, usecols=(0, 1, 2, 3))

    expected_sd = np.std(iris_np, axis=0, ddof=1)
    for col in range(4):
        h2o_sd = iris_h2o[col].sd()
        difference = abs(expected_sd[col] - h2o_sd)
        assert difference < 1e-10, "expected standard deviations to be the same"

    # Categorical column: sd() must fail
    try:
        iris_h2o[4].sd()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    # More than one column: sd() must fail
    try:
        iris_h2o[0:2].sd()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. more than one column."
Exemplo n.º 54
0
def https_import(ip,port):
    """Import the zipped prostate CSV from an HTTPS URL and show the frame."""
    h2o.init(ip,port)

    # Public S3 location of the test data
    url = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip"
    remote_frame = h2o.import_frame(path=url)
    remote_frame.show()
Exemplo n.º 55
0
def swpredsRF(ip,port):
    """Train random forests with and without the noise column and print their AUC.

    Per the original notes, the training set has two predictors: X1 with 10
    categorical levels (100 observations each) and X2 as Unif(0,1) noise;
    cat01 is a strong predictor of y = 1 (ratio 1.0) and cat02..cat10 are weak
    (ratio 0.5). Both AUCs are measured on the training frame.
    """
    h2o.init(ip,port)

    swpreds = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    # Distributed Random Forest with only the predictor column
    model1 = h2o.random_forest(x=swpreds[["X1"]],
                               y=swpreds["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    auc_without_noise = perf1.auc()
    print(auc_without_noise)

    # Distributed Random Forest including the noise column
    model2 = h2o.random_forest(x=swpreds[["X1","X2"]],
                               y=swpreds["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    auc_with_noise = perf2.auc()
    print(auc_with_noise)
Exemplo n.º 56
0
def ascharacter(ip,port):
    """Check that ``ascharacter()`` leaves the source column a factor.

    ``cylinders`` is converted to a factor, ``ascharacter()`` is called on it
    with the result discarded, and the column must still be a factor and not
    a string afterwards. ``ip`` and ``port`` are not used in this function.
    """
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    cars.show()

    cars['cylinders'] = cars['cylinders'].asfactor()
    # Result intentionally dropped: only the effect on the frame is checked.
    cars['cylinders'].ascharacter()

    still_factor = cars["cylinders"].isfactor()
    assert still_factor, "expected the column be a factor"
    became_string = cars["cylinders"].isstring()
    assert not became_string, "expected the column to not be a string"
def perfectSeparation_balanced(ip, port):
    """Fit a binomial GLM on the balanced perfect-separation data and bound its coefficients.

    Connects to H2O at ``ip``:``port``, fits a GLM of ``y`` on ``x1`` and
    ``x2`` with lambda search enabled, and asserts that every non-intercept
    coefficient is below 50.
    """

    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    data = h2o.import_frame(
        path=h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = h2o.glm(x=data[["x1", "x2"]],
                    y=data["y"],
                    family="binomial",
                    lambda_search=True,
                    use_all_factor_levels=True,
                    alpha=[0.5],
                    Lambda=[0])

    print(
        "Extract models' coefficients and assert reasonable values (ie. no greater than 50)"
    )
    print("Balanced dataset")
    # Each row of the coefficients table is read as (name, value, ...);
    # the intercept row is excluded from the check.
    coef = [
        c[1]
        for c in model._model_json['output']['coefficients_table'].cell_values
        if c[0] != "Intercept"
    ]
    # Only an upper bound is enforced; large negative coefficients would pass.
    for c in coef:
        assert c < 50, "coefficient is too large"