Exemplo n.º 1
0
    def attack(train, x):
        kwargs = {}

        # randomly select parameters and their corresponding values
        kwargs['k'] = random.randint(1,20)
        if random.randint(0,1): kwargs['model_id'] = "my_model"
        if random.randint(0,1): kwargs['max_iterations'] = random.randint(1,1000)
        if random.randint(0,1): kwargs['standardize'] = [True, False][random.randint(0,1)]
        if random.randint(0,1):
            method = random.randint(0,3)
            if method == 3:
                s = []
                for p in range(kwargs['k']):
                    s.append([random.uniform(train[c].mean()-100,train[c].mean()+100) for c in x])
                start = h2o.H2OFrame(python_obj=s)
                kwargs['user_points'] = start
            else:
                kwargs['init'] = ["Furthest","Random", "PlusPlus"][method]
        if random.randint(0,1): kwargs['seed'] = random.randint(1,10000)

        # display the parameters and their corresponding values
        print "-----------------------"
        print "x: {0}".format(x)
        for k, v in zip(kwargs.keys(), kwargs.values()):
            if k == 'user_points':
                print k + ": "
                start.show()
            else:
                print k + ": {0}".format(v)
        h2o.kmeans(x=train[x],  **kwargs)
        print "-----------------------"
Exemplo n.º 2
0
def parametersKmeans(ip, port):
    """Rebuild a k-means model from its own reported parameters and check
    the rebuilt model reproduces the original within-SS and centers.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #Log.info("Getting data...")
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    #Log.info("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    # collect label -> actual_value for every parameter of the fitted model
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']

    # refit with the extracted parameters; should yield an identical model
    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    #Log.info("wss")
    # NOTE(review): if withinss() returns a plain list, list.sort() returns
    # None and this comparison is vacuously None == None — consider sorted().
    wss = iris_km.withinss().sort()
    wss_again = iris_km_again.withinss().sort()
    assert wss == wss_again, "expected wss to be equal"

    #Log.info("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
    def attack(train, x):
        kwargs = {}

        # randomly select parameters and their corresponding values
        kwargs['k'] = random.randint(1,20)
        if random.randint(0,1): kwargs['model_id'] = "my_model"
        if random.randint(0,1): kwargs['max_iterations'] = random.randint(1,1000)
        if random.randint(0,1): kwargs['standardize'] = [True, False][random.randint(0,1)]
        if random.randint(0,1):
            method = random.randint(0,3)
            if method == 3:
                s = []
                for p in range(kwargs['k']):
                    s.append([random.uniform(train[c].mean()-100,train[c].mean()+100) for c in x])
                start = h2o.H2OFrame(python_obj=s)
                kwargs['user_points'] = start
            else:
                kwargs['init'] = ["Furthest","Random", "PlusPlus"][method]
        if random.randint(0,1): kwargs['seed'] = random.randint(1,10000)

        # display the parameters and their corresponding values
        print "-----------------------"
        print "x: {0}".format(x)
        for k, v in zip(kwargs.keys(), kwargs.values()):
            if k == 'user_points':
                print k + ": "
                start.show()
            else:
                print k + ": {0}".format(v)
        h2o.kmeans(x=train[x],  **kwargs)
        print "-----------------------"
Exemplo n.º 4
0
def parametersKmeans(ip,port):
    """Rebuild a k-means model from its own reported parameters and check
    the rebuilt model reproduces the original within-SS and centers.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    #Log.info("Getting data...")
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    #Log.info("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    # collect label -> actual_value for every parameter of the fitted model
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']

    # refit with the extracted parameters; should yield an identical model
    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    #Log.info("wss")
    # NOTE(review): if withinss() returns a plain list, list.sort() returns
    # None and this comparison is vacuously None == None — consider sorted().
    wss = iris_km.withinss().sort()
    wss_again = iris_km_again.withinss().sort()
    assert wss == wss_again, "expected wss to be equal"

    #Log.info("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
Exemplo n.º 5
0
def convergeKmeans(ip, port):
    """Run k-means miters times with max_iterations=1, feeding each step's
    centers back in as user_points, and check the result matches a single
    miters-iteration run; then check the final centers have converged.
    """

    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    # Log.info("Importing ozone.csv data...\n")
    ozone_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10

    # Log.info(paste("Run k-means in a loop of", miters, "iterations with max_iter = 1"))
    # TODO: implement row slicing
    # hard-coded starting centers (first 10 rows of ozone) until row slicing
    # is available
    start = h2o.H2OFrame([[41, 190, 67, 7.4], [36, 118, 72, 8],
                          [12, 149, 74, 12.6], [18, 313, 62, 11.5],
                          [23, 299, 65, 8.6], [19, 99, 59, 13.8],
                          [8, 19, 61, 20.1], [16, 256, 69, 9.7],
                          [11, 290, 66, 9.2], [14, 274, 68, 10.9]])
    start_key = start.send_frame()

    # expect error for 0 iterations
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # chain miters single-iteration fits, forwarding each fit's centers
    centers_key = start_key
    for i in range(miters):
        rep_fit = h2o.kmeans(x=ozone_h2o,
                             k=ncent,
                             user_points=centers_key,
                             max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())
        centers_key = centers.send_frame()

    # Log.info(paste("Run k-means with max_iter=miters"))
    all_fit = h2o.kmeans(x=ozone_h2o,
                         k=ncent,
                         user_points=start_key,
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(
    ), "expected the centers to be the same"

    # Log.info("Check cluster centers have converged")
    # one more iteration from the fitted centers: mean squared center
    # movement should be ~0 unless the model stopped before miters
    all_fit2 = h2o.kmeans(x=ozone_h2o,
                          k=ncent,
                          user_points=h2o.H2OFrame(
                              all_fit.centers()).send_frame(),
                          max_iterations=1)
    avg_change = sum([
        sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
        for c1, c2 in zip(all_fit.centers(), all_fit2.centers())
    ]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output'][
        'iterations'] < miters
Exemplo n.º 6
0
def convergeKmeans(ip, port):
    """Run k-means miters times with max_iterations=1, feeding each step's
    centers back in as user_points, and check the result matches a single
    miters-iteration run; then check the final centers have converged.
    """

    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    # Log.info("Importing ozone.csv data...\n")
    ozone_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10

    # Log.info(paste("Run k-means in a loop of", miters, "iterations with max_iter = 1"))
    # starting centers: first 10 rows / 4 columns of the ozone frame
    start = ozone_h2o[0:10, 0:4]

    # expect error for 0 iterations
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    # chain miters single-iteration fits, forwarding each fit's centers
    centers_key = start.eager()
    for i in range(miters):
        rep_fit = h2o.kmeans(x=ozone_h2o,
                             k=ncent,
                             user_points=centers_key,
                             max_iterations=1)
        centers = h2o.H2OFrame(rep_fit.centers())
        centers_key = centers.send_frame()

    # Log.info(paste("Run k-means with max_iter=miters"))
    all_fit = h2o.kmeans(x=ozone_h2o,
                         k=ncent,
                         user_points=start.eager(),
                         max_iterations=miters)
    assert rep_fit.centers() == all_fit.centers(
    ), "expected the centers to be the same"

    # Log.info("Check cluster centers have converged")
    # one more iteration from the fitted centers: mean squared center
    # movement should be ~0 unless the model stopped before miters
    all_fit2 = h2o.kmeans(x=ozone_h2o,
                          k=ncent,
                          user_points=h2o.H2OFrame(
                              all_fit.centers()).send_frame(),
                          max_iterations=1)
    avg_change = sum([
        sum([pow((e1 - e2), 2) for e1, e2 in zip(c1, c2)])
        for c1, c2 in zip(all_fit.centers(), all_fit2.centers())
    ]) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output'][
        'iterations'] < miters
def hdfs_kmeans_converge():
    """Fit k-means on BigCross from HDFS, then refit one iteration from the
    resulting centers and check the centers barely move (converged) or the
    iteration budget was exhausted. Raises EnvironmentError off-network.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible(
    )

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print("rows: {0}".format(n))
        ncent = 3
        miters = 10

        print("Run k-means with k = {0} and max_iterations = {1}".format(
            ncent, miters))
        cross1_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               max_iterations=miters)
        print(cross1_km)

        print(
            "Run k-means with init = final cluster centers and max_iterations = 1"
        )
        # restart from the fitted centers for a single iteration
        init_centers = h2o.H2OFrame(cross1_km.centers())
        cross2_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               user_points=init_centers,
                               max_iterations=1)
        print(cross2_km)

        print("Check k-means converged or maximum iterations reached")
        # mean squared movement of the centers after one extra iteration
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = old_div(((c1 - c2)**2).sum(), ncent)
        # NOTE(review): assumes column 3 of the model_summary row is the
        # iteration count — confirm against the summary schema
        iters = cross1_km._model_json['output']['model_summary'].cell_values[
            0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        raise EnvironmentError
def iris_h2o_vs_sciKmeans(ip,port):
  # Connect to a pre-existing cluster
  h2o.init(ip,port)  # connect to localhost:54321

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
  iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
  iris_sci = iris_sci[:,0:4]

  s =[[4.9,3.0,1.4,0.2],
  [5.6,2.5,3.9,1.1],
  [6.5,3.0,5.2,2.0]]

  start = h2o.H2OFrame(s)
  start_key = start.send_frame()

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print "Cluster centers from H2O:"
  h2o_centers = h2o_km.centers()
  print h2o_centers

  # Log.info("Cluster centers from scikit:")
  print "Cluster centers from scikit:"
  sci_centers = sci_km.cluster_centers_.tolist()
  print sci_centers

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter,scenter):
      assert (hpoint- spoint) < 1e-10, "expected centers to be the same"
Exemplo n.º 9
0
def benignKmeans(ip, port):
    """Run H2O and scikit-learn k-means side by side on the benign dataset
    for k = 1..6 and print both sets of cluster centers for comparison.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #  Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    for i in range(1, 7):
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=i)
        print "H2O centers"
        print benign_h2o_km.centers()

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print "sckit centers"
        print benign_sci_km.cluster_centers_
Exemplo n.º 10
0
def pyunit_model_params():

  pros = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))

  m = h2o.kmeans(pros,k=4)
  print m.params
  print m.full_parameters
def getModelKmeans(ip, port):
    """Fit k-means models for k = 2..6, round-trip each through
    h2o.getModel, and print scikit-learn centers for comparison.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        #TODO: impement h2o.getModel()
        # retrieve the model by key and display it again
        model = h2o.getModel(km_h2o._key)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_
Exemplo n.º 12
0
def pyunit_model_params(ip, port):

    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))

    m = h2o.kmeans(pros, k=4)
    print m.params
    print m.full_parameters
Exemplo n.º 13
0
def iris_h2o_vs_sciKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    iris_h2o = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(tests.locate("smalldata/iris/iris.csv"),
                             delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]]

    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4],
                        k=3,
                        user_points=start,
                        standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print "Cluster centers from H2O:"
    h2o_centers = h2o_km.centers()
    print h2o_centers

    # Log.info("Cluster centers from scikit:")
    print "Cluster centers from scikit:"
    sci_centers = sci_km.cluster_centers_.tolist()
    print sci_centers

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            assert (hpoint - spoint) < 1e-10, "expected centers to be the same"
def hdfs_kmeans_airlines():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible(
    )

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                                 x=airlines_h2o[myX],
                                 k=7,
                                 init="Furthest",
                                 max_iterations=10,
                                 standardize=True)
        print airlines_km
    else:
        raise (EnvironmentError,
               "Not running on H2O internal network.  No access to HDFS.")
Exemplo n.º 15
0
def hdfs_kmeans_airlines(ip, port):
    """Run k-means++ (k=7) on the airlines dataset imported from HDFS;
    prints a notice and returns quietly when HDFS is unreachable.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        # predictor columns: assorted numeric/categorical ranges plus col 9
        myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                                 x=airlines_h2o[myX],
                                 k=7,
                                 init="Furthest",
                                 max_iterations=10,
                                 standardize=True)
        print airlines_km
    else:
        print "Not running on H2O internal network.  No access to HDFS."
def getModelKmeans(ip,port):
    """Fit k-means models for k = 2..6, round-trip each through
    h2o.getModel, and print scikit-learn centers for comparison.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2,7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        #TODO: impement h2o.getModel()
        # retrieve the model by key and display it again
        model = h2o.getModel(km_h2o._key)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_
Exemplo n.º 17
0
def convergeKmeans(ip,port):
  """Run k-means miters times with max_iterations=1, feeding each step's
  centers back in as user_points, and check the result matches a single
  miters-iteration run; then check the final centers have converged.
  """

  # Connect to a pre-existing cluster
  h2o.init(ip,port)  # connect to localhost:54321

  # Log.info("Importing ozone.csv data...\n")
  ozone_h2o = h2o.import_frame(path=h2o.locate("smalldata/glm_test/ozone.csv"))
  #ozone_h2o.summary()

  miters = 5
  ncent = 10

  # Log.info(paste("Run k-means in a loop of", miters, "iterations with max_iter = 1"))
  # TODO: implement row slicing
  # hard-coded starting centers (first 10 rows of ozone) until row slicing
  # is available
  start = h2o.H2OFrame([[41,190,67,7.4],
  [36,118,72,8],
  [12,149,74,12.6],
  [18,313,62,11.5],
  [23,299,65,8.6],
  [19,99,59,13.8],
  [8,19,61,20.1],
  [16,256,69,9.7],
  [11,290,66,9.2],
  [14,274,68,10.9]])
  start_key = start.send_frame()

  # expect error for 0 iterations
  try:
    h2o.kmeans(x=ozone_h2o, max_iterations=0)
    assert False, "expected an error"
  except EnvironmentError:
    assert True

  # chain miters single-iteration fits, forwarding each fit's centers
  centers_key = start_key
  for i in range(miters):
    rep_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=centers_key, max_iterations=1)
    centers = h2o.H2OFrame(rep_fit.centers())
    centers_key = centers.send_frame()

  # Log.info(paste("Run k-means with max_iter=miters"))
  all_fit = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=start_key, max_iterations=miters)
  assert rep_fit.centers() == all_fit.centers(), "expected the centers to be the same"

  # Log.info("Check cluster centers have converged")
  # one more iteration from the fitted centers: mean squared center movement
  # should be ~0 unless the model stopped before miters
  all_fit2 = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=h2o.H2OFrame(all_fit.centers()).send_frame(), max_iterations=1)
  avg_change = sum([sum([pow((e1 - e2),2) for e1, e2 in zip(c1,c2)]) for c1, c2 in zip(all_fit.centers(),all_fit2.centers())])/ncent
  assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] < miters
Exemplo n.º 18
0
def get_model_test(ip,port):
    """Round-trip models of every category (regression, binomial,
    clustering, multinomial) through h2o.get_model and check each retrieved
    model reports the right category and predicts identically.
    """
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # disjoint 70/30 train/test split (the original used r >= 0.30, which
    # overlapped the training rows)
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected regression predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected binomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)
Exemplo n.º 19
0
def hdfs_kmeans_converge():
    """Fit k-means on BigCross from HDFS, then refit one iteration from the
    resulting centers and check the centers barely move (converged) or the
    iteration budget was exhausted; prints a notice when off-network.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print "rows: {0}".format(n)
        ncent = 3
        miters = 10

        print "Run k-means with k = {0} and max_iterations = {1}".format(
            ncent, miters)
        cross1_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               max_iterations=miters)
        print cross1_km

        print "Run k-means with init = final cluster centers and max_iterations = 1"
        # restart from the fitted centers for a single iteration
        init_centers = h2o.H2OFrame(cross1_km.centers())
        init_centers_key = init_centers.send_frame()
        cross2_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               user_points=init_centers_key,
                               max_iterations=1)
        print cross2_km

        print "Check k-means converged or maximum iterations reached"
        # mean squared movement of the centers after one extra iteration
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = ((c1 - c2)**2).sum() / ncent
        # NOTE(review): assumes column 3 of the model_summary row is the
        # iteration count — confirm against the summary schema
        iters = cross1_km._model_json['output']['model_summary'].cell_values[
            0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        print "Not running on H2O internal network.  No access to HDFS."
Exemplo n.º 20
0
def pyunit_model_params():
    """Build a k-means model on the prostate data and print its parameter
    dictionaries (basic params and the full parameter set)."""

    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    model = h2o.kmeans(frame, k=4)
    print(model.params)
    print(model.full_parameters)
Exemplo n.º 21
0
def km_num_iterations(ip,port):
    """Check that k-means stops within the requested iteration budget."""
    # Connect to a pre-existing cluster
      # connect to localhost:54321

    frame = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))

    model = h2o.kmeans(x=frame[1:], k=3, max_iterations=4)
    iters = model.num_iterations()
    assert iters <= 4, "Expected 4 iterations, but got {0}".format(iters)
Exemplo n.º 22
0
def get_model_test():
    """Round-trip models of every category (regression, binomial,
    clustering, multinomial) through h2o.get_model and check each retrieved
    model reports the right category and predicts identically.
    """

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # disjoint 70/30 train/test split on a uniform random column
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._id)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected regression predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._id)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected binomial predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._id)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._id)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)
def km_num_iterations():
    """Check that k-means stops within the requested iteration budget."""
    # Connect to a pre-existing cluster
      # connect to localhost:54321

    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    model = h2o.kmeans(x=frame[1:], k=3, max_iterations=4)
    iters = model.num_iterations()
    assert iters <= 4, "Expected 4 iterations, but got {0}".format(iters)
Exemplo n.º 24
0
def hdfs_kmeans():
    """Smoke-test k-means on two HDFS datasets (iris and covtype), checking
    row counts after import. Raises EnvironmentError when off-network.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible(
    )

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print("Import iris_wheader.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        n = iris_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
            n, 150)

        print("Running KMeans on iris")
        iris_km = h2o.kmeans(training_frame=iris_h2o,
                             k=3,
                             x=iris_h2o[0:4],
                             max_iterations=10)
        print(iris_km)

        print("Importing covtype.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_file(url)
        n = covtype_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(
            n, 581012)

        print("Running KMeans on covtype")
        covtype_km = h2o.kmeans(training_frame=covtype_h2o,
                                x=covtype_h2o[0:55],
                                k=8,
                                max_iterations=10)
        print(covtype_km)

    else:
        raise EnvironmentError
Exemplo n.º 25
0
def hdfs_kmeans(ip, port):
    """Smoke-test k-means on two HDFS datasets (iris and covtype), checking
    row counts after import; prints a notice when off-network.
    """
    h2o.init(ip, port)

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = h2o.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print "Import iris_wheader.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_frame(url)
        n = iris_h2o.nrow()
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
            n, 150)

        print "Running KMeans on iris"
        iris_km = h2o.kmeans(training_frame=iris_h2o,
                             k=3,
                             x=iris_h2o[0:4],
                             max_iterations=10)
        print iris_km

        print "Importing covtype.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_frame(url)
        n = covtype_h2o.nrow()
        print "rows: {0}".format(n)
        assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(
            n, 581012)

        print "Running KMeans on covtype"
        covtype_km = h2o.kmeans(training_frame=covtype_h2o,
                                x=covtype_h2o[0:55],
                                k=8,
                                max_iterations=10)
        print covtype_km

    else:
        print "Not running on H2O internal network.  No access to HDFS."
Exemplo n.º 26
0
def km_num_iterations(ip,port):
    """Check that k-means stops within the requested iteration budget.

    The model is built with max_iterations=2, so num_iterations must not
    exceed 2 (the original asserted against 4, inconsistent with the cap).
    """
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    prostate_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    prostate_km_h2o = h2o.kmeans(x=prostate_h2o[1:], k=3, max_iterations=2)
    num_iterations = prostate_km_h2o.num_iterations()
    #TODO: is there and off-by-one error here?
    assert num_iterations <= 2, "Expected at most 2 iterations, but got {0}".format(num_iterations)
Exemplo n.º 27
0
def convergeKmeans(ip, port):
    """Check k-means convergence behaviour on the ozone data.

    Verifies that (a) max_iterations=0 is rejected, (b) running k-means one
    iteration at a time, feeding each result back in as the start, matches a
    single multi-iteration run, and (c) the final centers have essentially
    converged unless the iteration cap was reached.
    """
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing ozone.csv data...\n")
    ozone_h2o = h2o.import_file(path=h2o.locate("smalldata/glm_test/ozone.csv"))
    # ozone_h2o.summary()

    max_iter = 5
    n_centers = 10

    # Log.info(paste("Run k-means in a loop of", miters, "iterations with max_iter = 1"))
    # Initial user-supplied centers: first 10 rows, first 4 columns.
    seed_points = ozone_h2o[0:10, 0:4]

    # Zero iterations is invalid and must raise.
    try:
        h2o.kmeans(x=ozone_h2o, max_iterations=0)
        assert False, "expected an error"
    except EnvironmentError:
        pass

    current = seed_points
    step_model = None
    for _ in range(max_iter):
        # One iteration per fit, chaining the resulting centers forward.
        step_model = h2o.kmeans(x=ozone_h2o, k=n_centers, user_points=current, max_iterations=1)
        current = h2o.H2OFrame(step_model.centers())

    # Log.info(paste("Run k-means with max_iter=miters"))
    full_model = h2o.kmeans(x=ozone_h2o, k=n_centers, user_points=seed_points, max_iterations=max_iter)
    assert step_model.centers() == full_model.centers(), "expected the centers to be the same"

    # Log.info("Check cluster centers have converged")
    # One more iteration from the final centers should barely move them,
    # unless the run stopped because it hit the iteration cap.
    refit = h2o.kmeans(x=ozone_h2o, k=n_centers,
                       user_points=h2o.H2OFrame(full_model.centers()), max_iterations=1)
    total_shift = 0.0
    for c_before, c_after in zip(full_model.centers(), refit.centers()):
        total_shift += sum((a - b) ** 2 for a, b in zip(c_before, c_after))
    avg_change = total_shift / n_centers
    assert avg_change < 1e-6 or full_model._model_json["output"]["iterations"] == max_iter
Exemplo n.º 28
0
def km_num_iterations(ip,port):
    """Verify the iteration count reported after a capped k-means run."""
    # Connect to a pre-existing cluster (localhost:54321).
    h2o.init(ip,port)

    frame = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    model = h2o.kmeans(x=frame[1:], k=3, max_iterations=3)
    iters = model.num_iterations()
    #TODO: is there and off-by-one error here?
    assert iters == 4, "Expected 4 iterations, but got {0}".format(iters)
def kmeans_mllib():
    """Compare H2O k-means++ against stored MLlib reference results.

    Runs k-means++ on BigCross.data (from HDFS) for every k listed in the
    MLlib benchmark file and compares cluster centers and the average
    within-cluster SSE.

    Raises:
        EnvironmentError: when the HDFS namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # Reference WCSSE table: one row per k -> (k, wcsse).
        err_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
            delimiter=",",
            skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print(
                "Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o,
                                  x=cross_h2o,
                                  k=k,
                                  init="PlusPlus",
                                  max_iterations=10,
                                  standardize=False)

            clust_mllib = np.genfromtxt(
                pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                    str(k) + ".csv"),
                delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            # Bug fix: search the whole first column for k, not just the
            # first 4 rows (err_mllib[0:4, 0] broke for k beyond row 4).
            wcsse_mllib = err_mllib[err_mllib[:, 0].tolist().index(k)][1]
            wcsse_h2o = old_div(cross_km.tot_withinss(), n)
            # Bug fix: these format strings had no {0} placeholder, so the
            # values were never actually printed.
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            # NOTE(review): exact float equality between two independently
            # computed SSE values — presumably intentional here; confirm a
            # tolerance is not needed.
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        raise EnvironmentError
def hdfs_kmeans_converge(ip, port):
    """Run k-means on BigCross.data from HDFS, then refit for one iteration
    from the final centers and check the centers barely move (converged) or
    the iteration cap was hit. Skips (with a message) when not on the H2O
    internal network.
    """
    h2o.init(ip, port)

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = h2o.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_frame(url)
        n = cross_h2o.nrow()
        print "rows: {0}".format(n)
        ncent = 3
        miters = 10

        print "Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters)
        cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent, max_iterations=miters)
        print cross1_km

        print "Run k-means with init = final cluster centers and max_iterations = 1"
        # Upload the final centers as a frame and pass its key as user_points.
        init_centers = h2o.H2OFrame(cross1_km.centers())
        init_centers_key = init_centers.send_frame()
        cross2_km = h2o.kmeans(
            training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent, user_points=init_centers_key, max_iterations=1
        )
        print cross2_km

        print "Check k-means converged or maximum iterations reached"
        # Mean squared movement of the centers after one extra iteration.
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = ((c1 - c2) ** 2).sum() / ncent
        # Iterations reported in the model summary table.
        iters = cross1_km._model_json["output"]["model_summary"].cell_values[0][3]
        assert avg_change < 1e-6 or iters > miters, (
            "Expected k-means to converge or reach max iterations. avg_change = "
            "{0} and iterations = {1}".format(avg_change, iters)
        )
    else:
        print "Not running on H2O internal network.  No access to HDFS."
def hdfs_kmeans_converge():
    """Run k-means on BigCross.data from HDFS, then refit for one iteration
    from the final centers and check they barely move (converged) unless the
    iteration cap was reached.

    Raises:
        EnvironmentError: when the HDFS namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode; without HDFS access this test cannot run.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()
    cross_path = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    cross_h2o = h2o.import_file("hdfs://{0}{1}".format(name_node, cross_path))
    n_rows = cross_h2o.nrow
    print("rows: {0}".format(n_rows))

    k = 3
    iter_cap = 10

    print("Run k-means with k = {0} and max_iterations = {1}".format(k, iter_cap))
    first_fit = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=k,
                           max_iterations=iter_cap)
    print(first_fit)

    print("Run k-means with init = final cluster centers and max_iterations = 1")
    seed_centers = h2o.H2OFrame(first_fit.centers())
    second_fit = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=k,
                            user_points=seed_centers, max_iterations=1)
    print(second_fit)

    print("Check k-means converged or maximum iterations reached")
    # Mean squared movement of the centers after the single extra iteration.
    before = h2o.H2OFrame(first_fit.centers())
    after = h2o.H2OFrame(second_fit.centers())
    avg_change = old_div(((before - after) ** 2).sum(), k)
    iters = first_fit._model_json['output']['model_summary'].cell_values[0][3]
    assert avg_change < 1e-6 or iters > iter_cap, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                  "{0} and iterations = {1}".format(avg_change, iters)
Exemplo n.º 32
0
def parametersKmeans(ip,port):
    """Train k-means, retrain with the first model's actual parameter values,
    and check the two models agree (same within-cluster SS and centers).

    Bug fix: ``list.sort()`` sorts in place and returns None, so the original
    ``wss`` comparison was always ``None == None``; use ``sorted()`` instead.
    """
    print("Getting data...")
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    parameters = iris_km._model_json['parameters']
    # Map each parameter label to the value the model actually used.
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['label']] = parameters[p]['actual_value']

    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    print("wss")
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def kmeans_mllib():
    """Compare H2O k-means++ against stored MLlib reference results.

    Runs k-means++ on BigCross.data (from HDFS) for every k listed in the
    MLlib benchmark file and compares cluster centers and the average
    within-cluster SSE.

    Raises:
        EnvironmentError: when the HDFS namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # Reference WCSSE table: one row per k -> (k, wcsse).
        err_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"), delimiter=",", skip_header=1
        )
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(
                training_frame=cross_h2o, x=cross_h2o, k=k, init="PlusPlus", max_iterations=10, standardize=False
            )

            clust_mllib = np.genfromtxt(
                pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"), delimiter=","
            ).tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            # Bug fix: search the whole first column for k, not just the
            # first 4 rows (err_mllib[0:4, 0] broke for k beyond row 4).
            wcsse_mllib = err_mllib[err_mllib[:, 0].tolist().index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            # Bug fix: these format strings had no {0} placeholder, so the
            # values were never actually printed.
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            assert wcsse_h2o == wcsse_mllib, (
                "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O "
                "got {1}".format(wcsse_mllib, wcsse_h2o)
            )
    else:
        # Bug fix: `raise (Type, msg)` is a Python-2-only tuple raise and a
        # TypeError on Python 3; construct the exception explicitly.
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
Exemplo n.º 34
0
def parametersKmeans():
    """Train k-means, retrain with the first model's actual parameter values,
    and check the two models agree (same within-cluster SS and centers).

    Bug fix: ``list.sort()`` sorts in place and returns None, so the original
    ``wss`` comparison was always ``None == None``; use ``sorted()`` instead.
    """
    print("Getting data...")
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    parameters = iris_km._model_json["parameters"]
    # Map each parameter label to the value the model actually used.
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]["label"]] = parameters[p]["actual_value"]

    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    print("wss")
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
Exemplo n.º 35
0
def emptyclusKmeans(ip, port):
    """Check that H2O k-means tolerates badly initialized (duplicate/degenerate)
    starting centers, by corrupting a random subset of hand-picked ozone
    centers and fitting both H2O and scikit-learn from them.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #Log.info("Importing ozone.csv data...\n")
    ozone_sci = np.loadtxt(h2o.locate("smalldata/glm_test/ozone.csv"),
                           delimiter=',',
                           skiprows=1)
    ozone_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/glm_test/ozone.csv"))

    ncent = 10
    # Number of centers to deliberately corrupt (at least 1, at most ncent/2).
    nempty = random.randint(1, ncent / 2)
    #TODO: implement row slicing
    initial_centers = [[41, 190, 67, 7.4], [36, 118, 72,
                                            8], [12, 149, 74, 12.6],
                       [18, 313, 62, 11.5], [23, 299, 65, 8.6],
                       [19, 99, 59, 13.8], [8, 19, 61, 20.1],
                       [16, 256, 69, 9.7], [11, 290, 66, 9.2],
                       [14, 274, 68, 10.9]]
    # Replace a random subset of centers with constant rows (all entries
    # 100*i), producing duplicate/degenerate starting points.
    for i in random.sample(range(0, ncent - 1), nempty):
        initial_centers[i] = [
            100 * i for z in range(1,
                                   len(initial_centers[0]) + 1)
        ]

    # NOTE(review): unlike the no-arg variant of this test, the center list is
    # not transposed before building the H2OFrame — confirm this H2OFrame
    # constructor treats the nested list row-wise.
    initial_centers_h2o = h2o.H2OFrame(initial_centers)
    initial_centers_h2o_key = initial_centers_h2o.send_frame()
    initial_centers_sci = np.asarray(initial_centers)

    #Log.info("Initial cluster centers:")
    print "H2O initial centers:"
    initial_centers_h2o.show()
    print "scikit initial centers:"
    print initial_centers_sci

    # H2O can handle empty clusters and so can scikit
    #Log.info("Check that H2O can handle badly initialized centers")
    km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1)
    km_sci.fit(preprocessing.scale(ozone_sci))
    print "scikit final centers"
    print km_sci.cluster_centers_

    km_h2o = h2o.kmeans(x=ozone_h2o,
                        k=ncent,
                        user_points=initial_centers_h2o_key,
                        standardize=True)
    print "H2O final centers"
    print km_h2o.centers()
def hdfs_kmeans():
    """Smoke-test k-means on iris and covtype imported from HDFS.

    Raises:
        EnvironmentError: when the HDFS namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode; without HDFS access this test cannot run.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    print("Import iris_wheader.csv from HDFS")
    iris_url = "hdfs://{0}{1}".format(name_node, "/datasets/runit/iris_wheader.csv")
    iris_h2o = h2o.import_file(iris_url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

    print("Running KMeans on iris")
    print(h2o.kmeans(training_frame=iris_h2o, k=3, x=iris_h2o[0:4], max_iterations=10))

    print("Importing covtype.data from HDFS")
    covtype_url = "hdfs://{0}{1}".format(name_node, "/datasets/runit/covtype.data")
    covtype_h2o = h2o.import_file(covtype_url)
    n = covtype_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

    print("Running KMeans on covtype")
    print(h2o.kmeans(training_frame=covtype_h2o, x=covtype_h2o[0:55], k=8, max_iterations=10))
Exemplo n.º 37
0
def hdfs_kmeans(ip, port):
    """Smoke-test k-means on iris and covtype imported from HDFS; prints a
    message and skips when not on the H2O internal network.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = h2o.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = h2o.get_h2o_internal_hdfs_name_node()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print "Import iris_wheader.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        n = iris_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

        print "Running KMeans on iris"
        iris_km = h2o.kmeans(training_frame = iris_h2o, k = 3, x = iris_h2o[0:4], max_iterations = 10)
        print iris_km

        print "Importing covtype.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_file(url)
        n = covtype_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

        print "Running KMeans on covtype"
        covtype_km = h2o.kmeans(training_frame = covtype_h2o, x = covtype_h2o[0:55], k = 8, max_iterations = 10)
        print covtype_km

    else:
        print "Not running on H2O internal network.  No access to HDFS."
def ozoneKM():
    """Fit a 10-cluster k-means++ model on the ozone data and score it."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    ozone = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))

    # See that the data is ready
    print(ozone.describe())

    # Run KMeans
    model = h2o.kmeans(x=ozone, k=10, init="PlusPlus", max_iterations=100)
    model.show()
    model.summary()

    # Score the training data with the fitted model and summarize the result.
    predictions = model.predict(ozone)
    predictions.describe()
Exemplo n.º 39
0
def ozoneKM(ip, port):
    """Fit a 10-cluster k-means++ model on the ozone data and score it."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    train = h2o.import_file(path=h2o.locate("smalldata/glm_test/ozone.csv"))

    # See that the data is ready
    print train.describe()

    # Run KMeans
    my_km = h2o.kmeans(x=train, k=10, init="PlusPlus", max_iterations=100)

    my_km.show()
    my_km.summary()

    # Score the training data with the fitted model and summarize the result.
    my_pred = my_km.predict(train)
    my_pred.describe()
def emptyclusKmeans():
    """Check that H2O k-means tolerates badly initialized (duplicate/degenerate)
    starting centers, by corrupting a random subset of hand-picked ozone
    centers and fitting both H2O and scikit-learn from them.
    """
    # Connect to a pre-existing cluster
      # connect to localhost:54321

    #Log.info("Importing ozone.csv data...\n")
    ozone_sci = np.loadtxt(pyunit_utils.locate("smalldata/glm_test/ozone.csv"), delimiter=',', skiprows=1)
    ozone_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))

    ncent = 10
    # Number of centers to deliberately corrupt (at least 1, at most ncent/2).
    nempty = random.randint(1,ncent/2)
    initial_centers = [[41,190,67,7.4],
                       [36,118,72,8],
                       [12,149,74,12.6],
                       [18,313,62,11.5],
                       [23,299,65,8.6],
                       [19,99,59,13.8],
                       [8,19,61,20.1],
                       [16,256,69,9.7],
                       [11,290,66,9.2],
                       [14,274,68,10.9]]
    # Replace a random subset of centers with constant rows (all entries
    # 100*i), producing duplicate/degenerate starting points.
    for i in random.sample(range(0,ncent-1), nempty):
        initial_centers[i] = [100*i for z in range(1,len(initial_centers[0])+1)]

    initial_centers_sci = np.asarray(initial_centers)
    # Transpose before building the H2OFrame — presumably this H2OFrame
    # constructor expects column-wise input here; verify against the API.
    initial_centers = zip(*initial_centers)

    initial_centers_h2o = h2o.H2OFrame(initial_centers)


    #Log.info("Initial cluster centers:")
    print "H2O initial centers:"
    initial_centers_h2o.show()
    print "scikit initial centers:"
    print initial_centers_sci

    # H2O can handle empty clusters and so can scikit
    #Log.info("Check that H2O can handle badly initialized centers")
    km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1)
    km_sci.fit(preprocessing.scale(ozone_sci))
    print "scikit final centers"
    print km_sci.cluster_centers_

    km_h2o = h2o.kmeans(x=ozone_h2o, k=ncent, user_points=initial_centers_h2o, standardize=True)
    print "H2O final centers"
    print km_h2o.centers()
Exemplo n.º 41
0
def prostateKmeans():
  """Fit k-means with k = 5..8 on the prostate data in both H2O and
  scikit-learn and display the resulting models/centers.
  """
  # Connect to a pre-existing cluster
    # connect to localhost:54321

  #Log.info("Importing prostate.csv data...\n")
  prostate_h2o = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
  #prostate.summary()

  # NOTE(review): scikit is trained on prostate_train.csv while H2O uses
  # prostate.csv — confirm the two files are meant to differ here.
  prostate_sci = np.loadtxt(tests.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
  # Drop the first column before clustering.
  prostate_sci = prostate_sci[:,1:]

  for i in range(5,9):
    #Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    #Log.info(paste( "Using these columns: ", colnames(prostate.hex)[-1]) )
    prostate_km_h2o = h2o.kmeans(x=prostate_h2o[1:], k=i)
    prostate_km_h2o.show()

    prostate_km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
    prostate_km_sci.fit(prostate_sci)
    print prostate_km_sci.cluster_centers_
def prostateKmeans():
  """Fit k-means with k = 5..8 on the prostate data in both H2O and
  scikit-learn and display the resulting models/centers.
  """
  # Connect to a pre-existing cluster
    # connect to localhost:54321

  #Log.info("Importing prostate.csv data...\n")
  prostate_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  #prostate.summary()

  # NOTE(review): scikit is trained on prostate_train.csv while H2O uses
  # prostate.csv — confirm the two files are meant to differ here.
  prostate_sci = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
  # Drop the first column before clustering.
  prostate_sci = prostate_sci[:,1:]

  for i in range(5,9):
    #Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    #Log.info(paste( "Using these columns: ", colnames(prostate.hex)[-1]) )
    prostate_km_h2o = h2o.kmeans(x=prostate_h2o[1:], k=i)
    prostate_km_h2o.show()

    prostate_km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
    prostate_km_sci.fit(prostate_sci)
    print prostate_km_sci.cluster_centers_
def benignKmeans():
    """Fit k-means with k = 1..6 on the benign data in both H2O and
    scikit-learn and print the resulting cluster centers side by side."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    #  Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    benign_sci = Imputer(missing_values="NaN", strategy="mean", axis=0).fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
    for n_clusters in range(1, 7):
        h2o_model = h2o.kmeans(x=benign_h2o, k=n_clusters)
        print("H2O centers")
        print(h2o_model.centers())

        sci_model = KMeans(n_clusters=n_clusters, init="k-means++", n_init=1)
        sci_model.fit(benign_sci)
        print("sckit centers")
        print(sci_model.cluster_centers_)
def get_modelKmeans():
    """Train k-means for k = 2..6 on the benign data, round-trip each model
    through the cluster via h2o.get_model, and fit scikit-learn k-means on
    the same data for comparison."""
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    mean_imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = mean_imputer.fit_transform(benign_sci)

    for n_clusters in range(2, 7):
        # Log.info("H2O K-Means")
        h2o_model = h2o.kmeans(x=benign_h2o, k=n_clusters)
        h2o_model.show()
        # Re-fetch the fitted model by id and display it again.
        fetched = h2o.get_model(h2o_model._id)
        fetched.show()

        sci_model = KMeans(n_clusters=n_clusters, init="k-means++", n_init=1)
        sci_model.fit(benign_sci)
        print("sckit centers")
        print(sci_model.cluster_centers_)
def hdfs_kmeans_airlines():
    """Run k-means++ (Furthest init, k=7) on airlines_all.csv from HDFS;
    prints a message and skips when not on the H2O internal network.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        # Column indices used as features (Python-2 list concatenation).
        myX = range(8) + range(11,16) + range(18,21) + range(24,29) + [9]
        airlines_km = h2o.kmeans(training_frame = airlines_h2o, x = airlines_h2o[myX], k = 7, init = "Furthest",
                                 max_iterations = 10, standardize = True)
        print airlines_km
    else:
        print "Not running on H2O internal network.  No access to HDFS."
def hdfs_kmeans_airlines():
    """Run k-means++ (Furthest init, k=7) on airlines_all.csv from HDFS.

    Raises:
        EnvironmentError: when the HDFS namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode; without HDFS access this test cannot run.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    print("Import airlines_all.csv from HDFS")
    url = "hdfs://{0}{1}".format(name_node, "/datasets/airlines_all.csv")
    airlines = h2o.import_file(url)
    print("rows: {0}".format(airlines.nrow))

    print("Run k-means++ with k = 7 and max_iterations = 10")
    # Column indices used as features.
    feature_cols = [c for block in (range(8), range(11, 16), range(18, 21), range(24, 29), [9])
                    for c in block]
    model = h2o.kmeans(training_frame=airlines, x=airlines[feature_cols], k=7, init="Furthest",
                       max_iterations=10, standardize=True)
    print(model)
Exemplo n.º 47
0
def metric_json_check(ip, port):
    """Check that each model category's performance metric json contains no
    keys beyond the expected set.

    Covers: gaussian GBM/GLM (regression), bernoulli GBM / binomial GLM,
    multinomial GBM, and k-means (clustering). Missing keys are tolerated;
    only unexpected extras fail — same one-sided check as the original
    repeated stanzas, now factored into one helper.
    """

    def _check_keys(metric_json, desired, label):
        # Fail when the metric json carries keys beyond the desired set.
        have = metric_json.keys()
        diff = list(set(have) - set(desired))
        assert not diff, "There's a difference between the current ({0}) and the desired ({1}) {2} " \
                         "metric json. The difference is {3}".format(have, desired, label, diff)

    df = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = h2o.gbm(y=df["CAPSULE"], x=df[3:], training_frame=df, distribution="gaussian")
    _check_keys(reg_mod.model_performance()._metric_json,
                [u'model_category', u'description', u'r2', u'frame', u'model_checksum', u'MSE',
                 u'__meta', u'scoring_time', u'predictions', u'model', u'duration_in_ms',
                 u'frame_checksum', u'mean_residual_deviance'],
                "regression")

    # Regression metric json (GLM)
    reg_mod = h2o.glm(y=df["CAPSULE"], x=df[3:], training_frame=df, family="gaussian")
    _check_keys(reg_mod.model_performance()._metric_json,
                [u'model_category', u'description', u'r2', u'residual_degrees_of_freedom',
                 u'frame', u'model_checksum', u'MSE', u'__meta', u'null_deviance',
                 u'scoring_time', u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
                 u'duration_in_ms', u'frame_checksum', u'residual_deviance',
                 u'mean_residual_deviance'],
                "glm-regression")

    # Binomial metric json
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, distribution="bernoulli")
    _check_keys(bin_mod.model_performance()._metric_json,
                [u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame',
                 u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
                 u'thresholds_and_metric_scores', u'predictions',
                 u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
                 u'frame_checksum', u'domain'],
                "binomial")

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, family="binomial")
    _check_keys(bin_mod.model_performance()._metric_json,
                [u'frame', u'residual_deviance', u'max_criteria_and_metric_scores', u'MSE',
                 u'frame_checksum', u'AIC', u'logloss', u'Gini', u'predictions', u'AUC',
                 u'description', u'model_checksum', u'duration_in_ms', u'model_category',
                 u'r2', u'residual_degrees_of_freedom', u'__meta', u'null_deviance',
                 u'scoring_time', u'null_degrees_of_freedom', u'model',
                 u'thresholds_and_metric_scores', u'domain'],
                "glm-binomial")

    # Multinomial metric json
    df = h2o.import_file(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX], y=df[myY], training_frame=df, distribution="multinomial")
    _check_keys(mul_mod.model_performance()._metric_json,
                [u'cm', u'model_category', u'description', u'r2', u'frame', u'model_checksum',
                 u'MSE', u'__meta', u'logloss', u'scoring_time', u'predictions',
                 u'hit_ratio_table', u'model', u'duration_in_ms', u'frame_checksum'],
                "multinomial")

    # Clustering metric json
    df = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    _check_keys(clus_mod.model_performance()._metric_json,
                [u'tot_withinss', u'model_category', u'description', u'frame',
                 u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
                 u'predictions', u'totss', u'model', u'duration_in_ms', u'frame_checksum',
                 u'centroid_stats'],
                "clustering")
Exemplo n.º 48
0
def metric_json_check(ip, port):
    """Check that model-performance metric JSON objects expose only expected keys.

    Builds one model per H2O model category -- GBM/GLM regression, GBM/GLM
    binomial, GBM multinomial, and K-means clustering -- pulls each model's
    performance metric JSON, and asserts that it contains no keys beyond the
    per-category desired set.

    :param ip: IP address of a running H2O cluster to connect to.
    :param port: port of the H2O cluster.
    :raises AssertionError: if any metric json contains unexpected keys.
    """
    h2o.init(ip, port)

    def _assert_no_extra_keys(keys_have, keys_desired, label):
        # Fail when the metric json carries a key absent from the desired list.
        # NOTE: the set difference is one-sided (have - desired), matching the
        # original checks -- desired keys *missing* from the json are tolerated.
        diff = list(set(keys_have) - set(keys_desired))
        assert not diff, "There's a difference between the current ({0}) and the desired ({1}) {2} " \
                         "metric json. The difference is {3}".format(keys_have, keys_desired, label, diff)

    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json (GBM, gaussian distribution)
    reg_mod = h2o.gbm(y=df["CAPSULE"], x=df[3:], training_frame=df, distribution="gaussian")
    _assert_no_extra_keys(
        reg_mod.model_performance()._metric_json.keys(),
        [u"model_category", u"description", u"r2", u"frame", u"model_checksum",
         u"MSE", u"__meta", u"scoring_time", u"predictions", u"model",
         u"duration_in_ms", u"frame_checksum"],
        "regression")

    # Regression metric json (GLM) -- adds deviance/AIC-related keys
    reg_mod = h2o.glm(y=df["CAPSULE"], x=df[3:], training_frame=df, family="gaussian")
    _assert_no_extra_keys(
        reg_mod.model_performance()._metric_json.keys(),
        [u"model_category", u"description", u"r2", u"residual_degrees_of_freedom",
         u"frame", u"model_checksum", u"MSE", u"__meta", u"null_deviance",
         u"scoring_time", u"null_degrees_of_freedom", u"predictions", u"AIC",
         u"model", u"duration_in_ms", u"frame_checksum", u"residual_deviance"],
        "glm-regression")

    # Binomial metric json (GBM; response converted to a factor for bernoulli)
    bin_mod = h2o.gbm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, distribution="bernoulli")
    _assert_no_extra_keys(
        bin_mod.model_performance()._metric_json.keys(),
        [u"AUC", u"Gini", u"model_category", u"description", u"r2", u"frame",
         u"model_checksum", u"MSE", u"__meta", u"logloss", u"scoring_time",
         u"thresholds_and_metric_scores", u"predictions",
         u"max_criteria_and_metric_scores", u"model", u"duration_in_ms",
         u"frame_checksum", u"domain"],
        "binomial")

    # Binomial metric json (GLM)
    bin_mod = h2o.glm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df, family="binomial")
    _assert_no_extra_keys(
        bin_mod.model_performance()._metric_json.keys(),
        [u"frame", u"residual_deviance", u"max_criteria_and_metric_scores", u"MSE",
         u"frame_checksum", u"AIC", u"logloss", u"Gini", u"predictions", u"AUC",
         u"description", u"model_checksum", u"duration_in_ms", u"model_category",
         u"r2", u"residual_degrees_of_freedom", u"__meta", u"null_deviance",
         u"scoring_time", u"null_degrees_of_freedom", u"model",
         u"thresholds_and_metric_scores", u"domain"],
        "glm-binomial")

    # Multinomial metric json (GBM on airlines data; fYear is the multi-class response)
    df = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"
    mul_mod = h2o.gbm(x=df[myX], y=df[myY], training_frame=df, distribution="multinomial")
    _assert_no_extra_keys(
        mul_mod.model_performance()._metric_json.keys(),
        [u"cm", u"model_category", u"description", u"r2", u"frame",
         u"model_checksum", u"MSE", u"__meta", u"logloss", u"scoring_time",
         u"predictions", u"hit_ratio_table", u"model", u"duration_in_ms",
         u"frame_checksum"],
        "multinomial")

    # Clustering metric json (K-means on iris)
    df = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    clus_mod = h2o.kmeans(x=df[0:4], k=3, standardize=False)
    _assert_no_extra_keys(
        clus_mod.model_performance()._metric_json.keys(),
        [u"tot_withinss", u"model_category", u"description", u"frame",
         u"model_checksum", u"MSE", u"__meta", u"scoring_time", u"betweenss",
         u"predictions", u"totss", u"model", u"duration_in_ms",
         u"frame_checksum", u"centroid_stats"],
        "clustering")
# Exemplo n.º 49
# 0
def metric_accessors(ip, port):

    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]

    # regression
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True, valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True, xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True, valid=True, xval=False)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True, valid=False, xval=True)
    assert "train" in mse.keys() and "xval" in mse.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True, valid=True, xval=True)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ) and "xval" in mse.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ) and isinstance(
        mse["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True, xval=True)
    assert "valid" in mse.keys() and "xval" in mse.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["valid"], float) and isinstance(
        mse["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["valid"]), type(mse["xval"]))

    #   r2
    r21 = gbm.r2(train=True, valid=False, xval=False)
    assert isinstance(r21, float)

    r22 = gbm.r2(train=False, valid=True, xval=False)
    assert isinstance(r22, float)

    r23 = gbm.r2(train=False, valid=False, xval=True)
    assert isinstance(r23, float)

    r2 = gbm.r2(train=True, valid=True, xval=False)
    assert "train" in r2.keys() and "valid" in r2.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["train"], float) and isinstance(
        r2["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(r2["train"]), type(r2["valid"]))
    assert r2["valid"] == r22

    r2 = gbm.r2(train=True, valid=False, xval=True)
    assert "train" in r2.keys() and "xval" in r2.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["train"], float) and isinstance(
        r2["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(r2["train"]), type(r2["xval"]))
    assert r2["xval"] == r23

    r2 = gbm.r2(train=True, valid=True, xval=True)
    assert "train" in r2.keys() and "valid" in r2.keys() and "xval" in r2.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["train"], float) and isinstance(
        r2["valid"], float
    ) and isinstance(
        r2["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(r2["train"]), type(r2["valid"]), type(r2["xval"]))

    r2 = gbm.r2(train=False, valid=False,
                xval=False)  # default: return training metrics
    assert isinstance(r2, float)
    assert r2 == r21

    r2 = gbm.r2(train=False, valid=True, xval=True)
    assert "valid" in r2.keys() and "xval" in r2.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert len(
        r2
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        r2.keys())
    assert isinstance(r2["valid"], float) and isinstance(
        r2["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(r2["valid"]), type(r2["xval"]))

    #   mean_residual_deviance
    mean_residual_deviance1 = gbm.mean_residual_deviance(train=True,
                                                         valid=False,
                                                         xval=False)
    assert isinstance(mean_residual_deviance1, float)

    mean_residual_deviance2 = gbm.mean_residual_deviance(train=False,
                                                         valid=True,
                                                         xval=False)
    assert isinstance(mean_residual_deviance2, float)

    mean_residual_deviance3 = gbm.mean_residual_deviance(train=False,
                                                         valid=False,
                                                         xval=True)
    assert isinstance(mean_residual_deviance3, float)

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=True,
                                                        xval=False)
    assert "train" in mean_residual_deviance.keys(
    ) and "valid" in mean_residual_deviance.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["valid"]))
    assert mean_residual_deviance["valid"] == mean_residual_deviance2

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=False,
                                                        xval=True)
    assert "train" in mean_residual_deviance.keys(
    ) and "xval" in mean_residual_deviance.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["xval"]))
    assert mean_residual_deviance["xval"] == mean_residual_deviance3

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,
                                                        valid=True,
                                                        xval=True)
    assert "train" in mean_residual_deviance.keys(
    ) and "valid" in mean_residual_deviance.keys(
    ) and "xval" in mean_residual_deviance.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(
        mean_residual_deviance["valid"], float
    ) and isinstance(
        mean_residual_deviance["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mean_residual_deviance["train"]),
        type(mean_residual_deviance["valid"]),
        type(mean_residual_deviance["xval"]))

    mean_residual_deviance = gbm.mean_residual_deviance(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    assert isinstance(mean_residual_deviance, float)
    assert mean_residual_deviance == mean_residual_deviance1

    mean_residual_deviance = gbm.mean_residual_deviance(train=False,
                                                        valid=True,
                                                        xval=True)
    assert "valid" in mean_residual_deviance.keys(
    ) and "xval" in mean_residual_deviance.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert len(
        mean_residual_deviance
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["valid"], float) and isinstance(
        mean_residual_deviance["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mean_residual_deviance["valid"]),
        type(mean_residual_deviance["xval"]))

    # binomial
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   auc
    auc1 = gbm.auc(train=True, valid=False, xval=False)
    assert isinstance(auc1, float)

    auc2 = gbm.auc(train=False, valid=True, xval=False)
    assert isinstance(auc2, float)

    auc3 = gbm.auc(train=False, valid=False, xval=True)
    assert isinstance(auc3, float)

    auc = gbm.auc(train=True, valid=True, xval=False)
    assert "train" in auc.keys() and "valid" in auc.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["train"], float) and isinstance(
        auc["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(auc["train"]), type(auc["valid"]))
    assert auc["valid"] == auc2

    auc = gbm.auc(train=True, valid=False, xval=True)
    assert "train" in auc.keys() and "xval" in auc.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["train"], float) and isinstance(
        auc["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(auc["train"]), type(auc["xval"]))
    assert auc["xval"] == auc3

    auc = gbm.auc(train=True, valid=True, xval=True)
    assert "train" in auc.keys() and "valid" in auc.keys(
    ) and "xval" in auc.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["train"], float) and isinstance(
        auc["valid"], float
    ) and isinstance(
        auc["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(auc["train"]), type(auc["valid"]), type(auc["xval"]))

    auc = gbm.auc(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(auc, float)
    assert auc == auc1

    auc = gbm.auc(train=False, valid=True, xval=True)
    assert "valid" in auc.keys() and "xval" in auc.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert len(
        auc
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        auc.keys())
    assert isinstance(auc["valid"], float) and isinstance(
        auc["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(auc["valid"]), type(auc["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True, valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True, xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True, valid=True, xval=False)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True, valid=False, xval=True)
    assert "train" in logloss.keys() and "xval" in logloss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True, valid=True, xval=True)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ) and "xval" in logloss.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ) and isinstance(
        logloss["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False,
                          xval=False)  # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True, xval=True)
    assert "valid" in logloss.keys() and "xval" in logloss.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["valid"], float) and isinstance(
        logloss["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["valid"]), type(logloss["xval"]))

    #   giniCoef
    giniCoef1 = gbm.giniCoef(train=True, valid=False, xval=False)
    assert isinstance(giniCoef1, float)

    giniCoef2 = gbm.giniCoef(train=False, valid=True, xval=False)
    assert isinstance(giniCoef2, float)

    giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True)
    assert isinstance(giniCoef3, float)

    giniCoef = gbm.giniCoef(train=True, valid=True, xval=False)
    assert "train" in giniCoef.keys() and "valid" in giniCoef.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["train"]), type(giniCoef["valid"]))
    assert giniCoef["valid"] == giniCoef2

    giniCoef = gbm.giniCoef(train=True, valid=False, xval=True)
    assert "train" in giniCoef.keys() and "xval" in giniCoef.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["train"]), type(giniCoef["xval"]))
    assert giniCoef["xval"] == giniCoef3

    giniCoef = gbm.giniCoef(train=True, valid=True, xval=True)
    assert "train" in giniCoef.keys() and "valid" in giniCoef.keys(
    ) and "xval" in giniCoef.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(
        giniCoef["valid"], float
    ) and isinstance(
        giniCoef["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(giniCoef["train"]), type(giniCoef["valid"]),
        type(giniCoef["xval"]))

    giniCoef = gbm.giniCoef(train=False, valid=False,
                            xval=False)  # default: return training metrics
    assert isinstance(giniCoef, float)
    assert giniCoef == giniCoef1

    giniCoef = gbm.giniCoef(train=False, valid=True, xval=True)
    assert "valid" in giniCoef.keys() and "xval" in giniCoef.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert len(
        giniCoef
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        giniCoef.keys())
    assert isinstance(giniCoef["valid"], float) and isinstance(
        giniCoef["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(giniCoef["valid"]), type(giniCoef["xval"]))

    #   F1
    F11 = gbm.F1(train=True, valid=False, xval=False)
    F12 = gbm.F1(train=False, valid=True, xval=False)
    F13 = gbm.F1(train=False, valid=False, xval=True)
    F1 = gbm.F1(train=True, valid=True, xval=False)
    F1 = gbm.F1(train=True, valid=False, xval=True)
    F1 = gbm.F1(train=True, valid=True, xval=True)
    F1 = gbm.F1(train=False, valid=False,
                xval=False)  # default: return training metrics
    F1 = gbm.F1(train=False, valid=True, xval=True)

    #   F0point5
    F0point51 = gbm.F0point5(train=True, valid=False, xval=False)
    F0point52 = gbm.F0point5(train=False, valid=True, xval=False)
    F0point53 = gbm.F0point5(train=False, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True, valid=True, xval=False)
    F0point5 = gbm.F0point5(train=True, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True, valid=True, xval=True)
    F0point5 = gbm.F0point5(train=False, valid=False,
                            xval=False)  # default: return training metrics
    F0point5 = gbm.F0point5(train=False, valid=True, xval=True)

    #   F2
    F21 = gbm.F2(train=True, valid=False, xval=False)
    F22 = gbm.F2(train=False, valid=True, xval=False)
    F23 = gbm.F2(train=False, valid=False, xval=True)
    F2 = gbm.F2(train=True, valid=True, xval=False)
    F2 = gbm.F2(train=True, valid=False, xval=True)
    F2 = gbm.F2(train=True, valid=True, xval=True)
    F2 = gbm.F2(train=False, valid=False,
                xval=False)  # default: return training metrics
    F2 = gbm.F2(train=False, valid=True, xval=True)

    #   accuracy
    accuracy1 = gbm.accuracy(train=True, valid=False, xval=False)
    accuracy2 = gbm.accuracy(train=False, valid=True, xval=False)
    accuracy3 = gbm.accuracy(train=False, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True, valid=True, xval=False)
    accuracy = gbm.accuracy(train=True, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True, valid=True, xval=True)
    accuracy = gbm.accuracy(train=False, valid=False,
                            xval=False)  # default: return training metrics
    accuracy = gbm.accuracy(train=False, valid=True, xval=True)

    #   error
    error1 = gbm.error(train=True, valid=False, xval=False)
    error2 = gbm.error(train=False, valid=True, xval=False)
    error3 = gbm.error(train=False, valid=False, xval=True)
    error = gbm.error(train=True, valid=True, xval=False)
    error = gbm.error(train=True, valid=False, xval=True)
    error = gbm.error(train=True, valid=True, xval=True)
    error = gbm.error(train=False, valid=False,
                      xval=False)  # default: return training metrics
    error = gbm.error(train=False, valid=True, xval=True)

    #   precision
    precision1 = gbm.precision(train=True, valid=False, xval=False)
    precision2 = gbm.precision(train=False, valid=True, xval=False)
    precision3 = gbm.precision(train=False, valid=False, xval=True)
    precision = gbm.precision(train=True, valid=True, xval=False)
    precision = gbm.precision(train=True, valid=False, xval=True)
    precision = gbm.precision(train=True, valid=True, xval=True)
    precision = gbm.precision(train=False, valid=False,
                              xval=False)  # default: return training metrics
    precision = gbm.precision(train=False, valid=True, xval=True)

    #   mcc
    mcc1 = gbm.mcc(train=True, valid=False, xval=False)
    mcc2 = gbm.mcc(train=False, valid=True, xval=False)
    mcc3 = gbm.mcc(train=False, valid=False, xval=True)
    mcc = gbm.mcc(train=True, valid=True, xval=False)
    mcc = gbm.mcc(train=True, valid=False, xval=True)
    mcc = gbm.mcc(train=True, valid=True, xval=True)
    mcc = gbm.mcc(train=False, valid=False,
                  xval=False)  # default: return training metrics
    mcc = gbm.mcc(train=False, valid=True, xval=True)

    #   max_per_class_error
    max_per_class_error1 = gbm.max_per_class_error(train=True,
                                                   valid=False,
                                                   xval=False)
    max_per_class_error2 = gbm.max_per_class_error(train=False,
                                                   valid=True,
                                                   xval=False)
    max_per_class_error3 = gbm.max_per_class_error(train=False,
                                                   valid=False,
                                                   xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=True,
                                                  xval=False)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=False,
                                                  xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,
                                                  valid=True,
                                                  xval=True)
    max_per_class_error = gbm.max_per_class_error(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    max_per_class_error = gbm.max_per_class_error(train=False,
                                                  valid=True,
                                                  xval=True)

    #   confusion_matrix
    confusion_matrix1 = gbm.confusion_matrix(train=True,
                                             valid=False,
                                             xval=False)
    confusion_matrix2 = gbm.confusion_matrix(train=False,
                                             valid=True,
                                             xval=False)
    confusion_matrix3 = gbm.confusion_matrix(train=False,
                                             valid=False,
                                             xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=False)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=False, xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True, valid=True, xval=True)
    confusion_matrix = gbm.confusion_matrix(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    confusion_matrix = gbm.confusion_matrix(train=False, valid=True, xval=True)

    # #   plot
    # plot1 = gbm.plot(train=True,  valid=False, xval=False)
    # plot2 = gbm.plot(train=False, valid=True,  xval=False)
    # plot3 = gbm.plot(train=False, valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=False)
    # plot = gbm.plot(train=True,  valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=True)
    # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics
    # plot = gbm.plot(train=False, valid=True,  xval=True)

    # #   tpr
    # tpr1 = gbm.tpr(train=True,  valid=False, xval=False)
    # tpr2 = gbm.tpr(train=False, valid=True,  xval=False)
    # tpr3 = gbm.tpr(train=False, valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=False)
    # tpr = gbm.tpr(train=True,  valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=True)
    # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics
    # tpr = gbm.tpr(train=False, valid=True,  xval=True)
    #
    # #   tnr
    # tnr1 = gbm.tnr(train=True,  valid=False, xval=False)
    # tnr2 = gbm.tnr(train=False, valid=True,  xval=False)
    # tnr3 = gbm.tnr(train=False, valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=False)
    # tnr = gbm.tnr(train=True,  valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=True)
    # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics
    # tnr = gbm.tnr(train=False, valid=True,  xval=True)
    #
    # #   fnr
    # fnr1 = gbm.fnr(train=True,  valid=False, xval=False)
    # fnr2 = gbm.fnr(train=False, valid=True,  xval=False)
    # fnr3 = gbm.fnr(train=False, valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=False)
    # fnr = gbm.fnr(train=True,  valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=True)
    # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics
    # fnr = gbm.fnr(train=False, valid=True,  xval=True)
    #
    # #   fpr
    # fpr1 = gbm.fpr(train=True,  valid=False, xval=False)
    # fpr2 = gbm.fpr(train=False, valid=True,  xval=False)
    # fpr3 = gbm.fpr(train=False, valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=False)
    # fpr = gbm.fpr(train=True,  valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=True)
    # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics
    # fpr = gbm.fpr(train=False, valid=True,  xval=True)

    # multinomial
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    distribution = "multinomial"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True, valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True, xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True, valid=True, xval=False)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True, valid=False, xval=True)
    assert "train" in mse.keys() and "xval" in mse.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True, valid=True, xval=True)
    assert "train" in mse.keys() and "valid" in mse.keys(
    ) and "xval" in mse.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["train"], float) and isinstance(
        mse["valid"], float
    ) and isinstance(
        mse["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False,
                  xval=False)  # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True, xval=True)
    assert "valid" in mse.keys() and "xval" in mse.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert len(
        mse
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        mse.keys())
    assert isinstance(mse["valid"], float) and isinstance(
        mse["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(mse["valid"]), type(mse["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True, valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True, xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True, valid=True, xval=False)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ), "expected training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ), "expected training and validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True, valid=False, xval=True)
    assert "train" in logloss.keys() and "xval" in logloss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True, valid=True, xval=True)
    assert "train" in logloss.keys() and "valid" in logloss.keys(
    ) and "xval" in logloss.keys(
    ), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(
        logloss["valid"], float
    ) and isinstance(
        logloss["xval"], float
    ), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(
        type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False,
                          xval=False)  # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True, xval=True)
    assert "valid" in logloss.keys() and "xval" in logloss.keys(
    ), "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert len(
        logloss
    ) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(
        logloss.keys())
    assert isinstance(logloss["valid"], float) and isinstance(
        logloss["xval"], float
    ), "validation and cross validation metrics to be floats, but got {0} and {1}".format(
        type(logloss["valid"]), type(logloss["xval"]))

    #   hit_ratio_table
    hit_ratio_table1 = gbm.hit_ratio_table(train=True, valid=False, xval=False)
    hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True, xval=False)
    hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=False)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True, valid=True, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True, xval=True)

    # clustering
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))
    km = h2o.kmeans(x=iris[0:4], nfolds=3, k=3)

    #   betweenss
    betweenss1 = km.betweenss(train=True, valid=False, xval=False)
    assert isinstance(betweenss1, float)

    betweenss3 = km.betweenss(train=False, valid=False, xval=True)
    assert isinstance(betweenss3, float)

    betweenss = km.betweenss(train=True, valid=False, xval=True)
    assert "train" in betweenss.keys() and "xval" in betweenss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        betweenss.keys())
    assert len(
        betweenss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        betweenss.keys())
    assert isinstance(betweenss["train"], float) and isinstance(
        betweenss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(betweenss["train"]), type(betweenss["xval"]))
    assert betweenss["xval"] == betweenss3

    betweenss = km.betweenss(train=False, valid=False,
                             xval=False)  # default: return training metrics
    assert isinstance(betweenss, float)
    assert betweenss == betweenss1

    #   totss
    totss1 = km.totss(train=True, valid=False, xval=False)
    assert isinstance(totss1, float)

    totss3 = km.totss(train=False, valid=False, xval=True)
    assert isinstance(totss3, float)

    totss = km.totss(train=True, valid=False, xval=True)
    assert "train" in totss.keys() and "xval" in totss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        totss.keys())
    assert len(
        totss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        totss.keys())
    assert isinstance(totss["train"], float) and isinstance(
        totss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(totss["train"]), type(totss["xval"]))
    assert totss["xval"] == totss3

    totss = km.totss(train=False, valid=False,
                     xval=False)  # default: return training metrics
    assert isinstance(totss, float)
    assert totss == totss1

    #   tot_withinss
    tot_withinss1 = km.tot_withinss(train=True, valid=False, xval=False)
    assert isinstance(tot_withinss1, float)

    tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True)
    assert isinstance(tot_withinss3, float)

    tot_withinss = km.tot_withinss(train=True, valid=False, xval=True)
    assert "train" in tot_withinss.keys() and "xval" in tot_withinss.keys(
    ), "expected training and cross validation metrics to be returned, but got {0}".format(
        tot_withinss.keys())
    assert len(
        tot_withinss
    ) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(
        tot_withinss.keys())
    assert isinstance(tot_withinss["train"], float) and isinstance(
        tot_withinss["xval"], float
    ), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(
        type(tot_withinss["train"]), type(tot_withinss["xval"]))
    assert tot_withinss["xval"] == tot_withinss3

    tot_withinss = km.tot_withinss(
        train=False, valid=False,
        xval=False)  # default: return training metrics
    assert isinstance(tot_withinss, float)
    assert tot_withinss == tot_withinss1

    #   withinss
    withinss1 = km.withinss(train=True, valid=False, xval=False)
    withinss3 = km.withinss(train=False, valid=False, xval=True)
    withinss = km.withinss(train=True, valid=False, xval=True)
    withinss = km.withinss(train=False, valid=False,
                           xval=False)  # default: return training metrics

    #   centroid_stats
    centroid_stats1 = km.centroid_stats(train=True, valid=False, xval=False)
    centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True)
    centroid_stats = km.centroid_stats(train=True, valid=False, xval=True)
    centroid_stats = km.centroid_stats(
        train=False, valid=False,
        xval=False)  # default: return training metrics

    #   size
    size1 = km.size(train=True, valid=False, xval=False)
    size3 = km.size(train=False, valid=False, xval=True)
    size = km.size(train=True, valid=False, xval=True)
    size = km.size(train=False, valid=False,
                   xval=False)  # default: return training metrics
def metric_json_check():
    """Verify that model-performance metric JSON objects expose exactly the
    expected keys for each model category.

    Builds one model per category -- regression (GBM, GLM), binomial (GBM,
    GLM), multinomial (GBM), and clustering (K-means) -- and asserts that
    ``model_performance()._metric_json`` contains no keys beyond the
    documented set for that category.
    """

    def _assert_no_extra_keys(model, desired_keys, label):
        # Any key present in the metric json but absent from the expected
        # list indicates an (unintended) change in the H2O REST schema, so
        # surface the full diff in the assertion message.
        have_keys = model.model_performance()._metric_json.keys()
        diff = list(set(have_keys) - set(desired_keys))
        assert not diff, "There's a difference between the current ({0}) and the desired ({1}) {2} " \
                         "metric json. The difference is {3}".format(have_keys, desired_keys,
                                                                     label, diff)

    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json (GBM)
    _assert_no_extra_keys(
        h2o.gbm(y=df["CAPSULE"], x=df[3:], training_frame=df,
                distribution="gaussian"),
        [
            u'model_category', u'description', u'r2', u'frame', u'model_checksum',
            u'MSE', u'__meta', u'scoring_time', u'predictions', u'model',
            u'duration_in_ms', u'frame_checksum', u'mean_residual_deviance'
        ],
        "regression")

    # Regression metric json (GLM) -- adds deviance/AIC/degrees-of-freedom keys
    _assert_no_extra_keys(
        h2o.glm(y=df["CAPSULE"], x=df[3:], training_frame=df,
                family="gaussian"),
        [
            u'model_category', u'description', u'r2',
            u'residual_degrees_of_freedom', u'frame', u'model_checksum', u'MSE',
            u'__meta', u'null_deviance', u'scoring_time',
            u'null_degrees_of_freedom', u'predictions', u'AIC', u'model',
            u'duration_in_ms', u'frame_checksum', u'residual_deviance',
            u'mean_residual_deviance'
        ],
        "glm-regression")

    # Binomial metric json (GBM)
    _assert_no_extra_keys(
        h2o.gbm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df,
                distribution="bernoulli"),
        [
            u'AUC', u'Gini', u'model_category', u'description', u'r2', u'frame',
            u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
            u'thresholds_and_metric_scores', u'predictions',
            u'max_criteria_and_metric_scores', u'model', u'duration_in_ms',
            u'frame_checksum', u'domain'
        ],
        "binomial")

    # Binomial metric json (GLM)
    _assert_no_extra_keys(
        h2o.glm(y=df["CAPSULE"].asfactor(), x=df[3:], training_frame=df,
                family="binomial"),
        [
            u'frame', u'residual_deviance', u'max_criteria_and_metric_scores',
            u'MSE', u'frame_checksum', u'AIC', u'logloss', u'Gini', u'predictions',
            u'AUC', u'description', u'model_checksum', u'duration_in_ms',
            u'model_category', u'r2', u'residual_degrees_of_freedom', u'__meta',
            u'null_deviance', u'scoring_time', u'null_degrees_of_freedom',
            u'model', u'thresholds_and_metric_scores', u'domain'
        ],
        "glm-binomial")

    # Multinomial metric json (GBM) -- needs a multi-level response ("fYear")
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = [
        "Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance",
        "fDayofMonth", "fDayOfWeek"
    ]
    myY = "fYear"
    _assert_no_extra_keys(
        h2o.gbm(x=df[myX], y=df[myY], training_frame=df,
                distribution="multinomial"),
        [
            u'cm', u'model_category', u'description', u'r2', u'frame',
            u'model_checksum', u'MSE', u'__meta', u'logloss', u'scoring_time',
            u'predictions', u'hit_ratio_table', u'model', u'duration_in_ms',
            u'frame_checksum'
        ],
        "multinomial")

    # Clustering metric json (K-means)
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    _assert_no_extra_keys(
        h2o.kmeans(x=df[0:4], k=3, standardize=False),
        [
            u'tot_withinss', u'model_category', u'description', u'frame',
            u'model_checksum', u'MSE', u'__meta', u'scoring_time', u'betweenss',
            u'predictions', u'totss', u'model', u'duration_in_ms',
            u'frame_checksum', u'centroid_stats'
        ],
        "clustering")
Exemplo n.º 51
0
def init_err_casesKmeans():
    """Exercise h2o.kmeans() argument-validation error paths.

    Checks that bad ``init`` strings, empty/mis-shaped/oversized
    ``user_points`` are rejected with ``EnvironmentError``, and that
    NA/NaN/Inf entries and duplicate starting points are tolerated
    without error.
    """
    # The test framework connects to the pre-existing cluster at
    # localhost:54321; no explicit h2o.init() is needed here.

    def _expect_kmeans_error(**kwargs):
        # H2O surfaces server-side validation failures as EnvironmentError;
        # anything else (including silent success) fails the test.
        try:
            h2o.kmeans(**kwargs)
            assert False, "expected an error"
        except EnvironmentError:
            pass

    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    numcol = benign_h2o.ncol
    numrow = benign_h2o.nrow

    # Non-numeric init entry that isn't 'Random', 'PlusPlus', or 'Furthest'
    _expect_kmeans_error(x=benign_h2o, k=5, init='Test123')

    # Empty list, tuple, or dictionary of user points
    _expect_kmeans_error(x=benign_h2o, k=0, user_points=[])
    _expect_kmeans_error(x=benign_h2o, k=0, user_points=())
    _expect_kmeans_error(x=benign_h2o, k=0, user_points={})

    # Number of columns doesn't equal the training set's
    start_small = [[random.gauss(0, 1) for c in range(numcol - 2)] for r in range(5)]
    start_large = [[random.gauss(0, 1) for c in range(numcol + 2)] for r in range(5)]
    _expect_kmeans_error(x=benign_h2o, k=5, user_points=h2o.H2OFrame(start_small))
    _expect_kmeans_error(x=benign_h2o, k=5, user_points=h2o.H2OFrame(start_large))

    # Number of rows exceeds the training set's
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(numrow + 2)]
    _expect_kmeans_error(x=benign_h2o, k=numrow + 2, user_points=h2o.H2OFrame(start))

    # Nones are replaced with mean of a column in H2O. Not sure about Inf.
    # Any entry being NA, NaN, or Inf should be accepted (no error raised).
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    for bad in ["NA", "NaN", "Inf", "-Inf"]:
        # BUG FIX: the original used `start[:]`, a shallow copy that shares
        # the inner row lists, so writing into row 1 also mutated `start`
        # and leaked the previous bad value into every later iteration.
        # Copy each row so every iteration starts from a clean matrix.
        start_err = [row[:] for row in start]
        start_err[1][random.randint(0, numcol - 1)] = bad
        h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start_err))

    # Duplicates only affect sampling probability during initialization;
    # duplicate initial clusters must not raise.
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    start[2] = start[0]
    h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start))
Exemplo n.º 52
0
def metric_accessors(ip,port):

    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]

    # regression
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement","power","weight","acceleration","year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True,  valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True,  xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True,  valid=True,  xval=False)
    assert "train" in mse.keys() and "valid" in mse.keys(), "expected training and validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 2, "expected only training and validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["train"], float) and isinstance(mse["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True,  valid=False, xval=True)
    assert "train" in mse.keys() and "xval" in mse.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["train"], float) and isinstance(mse["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True,  valid=True,  xval=True)
    assert "train" in mse.keys() and "valid" in mse.keys() and "xval" in mse.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["train"], float) and isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True,  xval=True)
    assert "valid" in mse.keys() and "xval" in mse.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["valid"]), type(mse["xval"]))

    #   r2
    r21 = gbm.r2(train=True,  valid=False, xval=False)
    assert isinstance(r21, float)

    r22 = gbm.r2(train=False, valid=True,  xval=False)
    assert isinstance(r22, float)

    r23 = gbm.r2(train=False, valid=False, xval=True)
    assert isinstance(r23, float)

    r2 = gbm.r2(train=True,  valid=True,  xval=False)
    assert "train" in r2.keys() and "valid" in r2.keys(), "expected training and validation metrics to be returned, but got {0}".format(r2.keys())
    assert len(r2) == 2, "expected only training and validation metrics to be returned, but got {0}".format(r2.keys())
    assert isinstance(r2["train"], float) and isinstance(r2["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(r2["train"]), type(r2["valid"]))
    assert r2["valid"] == r22

    r2 = gbm.r2(train=True,  valid=False, xval=True)
    assert "train" in r2.keys() and "xval" in r2.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(r2.keys())
    assert len(r2) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(r2.keys())
    assert isinstance(r2["train"], float) and isinstance(r2["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(r2["train"]), type(r2["xval"]))
    assert r2["xval"] == r23

    r2 = gbm.r2(train=True,  valid=True,  xval=True)
    assert "train" in r2.keys() and "valid" in r2.keys() and "xval" in r2.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(r2.keys())
    assert len(r2) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(r2.keys())
    assert isinstance(r2["train"], float) and isinstance(r2["valid"], float) and isinstance(r2["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(r2["train"]), type(r2["valid"]), type(r2["xval"]))

    r2 = gbm.r2(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(r2, float)
    assert r2 == r21

    r2 = gbm.r2(train=False, valid=True,  xval=True)
    assert "valid" in r2.keys() and "xval" in r2.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(r2.keys())
    assert len(r2) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(r2.keys())
    assert isinstance(r2["valid"], float) and isinstance(r2["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(r2["valid"]), type(r2["xval"]))

    #   mean_residual_deviance
    mean_residual_deviance1 = gbm.mean_residual_deviance(train=True,  valid=False, xval=False)
    assert isinstance(mean_residual_deviance1, float)

    mean_residual_deviance2 = gbm.mean_residual_deviance(train=False, valid=True,  xval=False)
    assert isinstance(mean_residual_deviance2, float)

    mean_residual_deviance3 = gbm.mean_residual_deviance(train=False, valid=False, xval=True)
    assert isinstance(mean_residual_deviance3, float)

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,  valid=True,  xval=False)
    assert "train" in mean_residual_deviance.keys() and "valid" in mean_residual_deviance.keys(), "expected training and validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert len(mean_residual_deviance) == 2, "expected only training and validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(mean_residual_deviance["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"]))
    assert mean_residual_deviance["valid"] == mean_residual_deviance2

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,  valid=False, xval=True)
    assert "train" in mean_residual_deviance.keys() and "xval" in mean_residual_deviance.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert len(mean_residual_deviance) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(mean_residual_deviance["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(mean_residual_deviance["train"]), type(mean_residual_deviance["xval"]))
    assert mean_residual_deviance["xval"] == mean_residual_deviance3

    mean_residual_deviance = gbm.mean_residual_deviance(train=True,  valid=True,  xval=True)
    assert "train" in mean_residual_deviance.keys() and "valid" in mean_residual_deviance.keys() and "xval" in mean_residual_deviance.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert len(mean_residual_deviance) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["train"], float) and isinstance(mean_residual_deviance["valid"], float) and isinstance(mean_residual_deviance["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(mean_residual_deviance["train"]), type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"]))

    mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(mean_residual_deviance, float)
    assert mean_residual_deviance == mean_residual_deviance1

    mean_residual_deviance = gbm.mean_residual_deviance(train=False, valid=True,  xval=True)
    assert "valid" in mean_residual_deviance.keys() and "xval" in mean_residual_deviance.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert len(mean_residual_deviance) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(mean_residual_deviance.keys())
    assert isinstance(mean_residual_deviance["valid"], float) and isinstance(mean_residual_deviance["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(mean_residual_deviance["valid"]), type(mean_residual_deviance["xval"]))


    # binomial
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    predictors = ["displacement","power","weight","acceleration","year"]
    gbm = h2o.gbm(y=train[response_col], x=train[predictors], validation_y=valid[response_col], validation_x=valid[predictors], nfolds=3, distribution=distribution, fold_assignment="Random")

    #   auc
    auc1 = gbm.auc(train=True,  valid=False, xval=False)
    assert isinstance(auc1, float)

    auc2 = gbm.auc(train=False, valid=True,  xval=False)
    assert isinstance(auc2, float)

    auc3 = gbm.auc(train=False, valid=False, xval=True)
    assert isinstance(auc3, float)

    auc = gbm.auc(train=True,  valid=True,  xval=False)
    assert "train" in auc.keys() and "valid" in auc.keys(), "expected training and validation metrics to be returned, but got {0}".format(auc.keys())
    assert len(auc) == 2, "expected only training and validation metrics to be returned, but got {0}".format(auc.keys())
    assert isinstance(auc["train"], float) and isinstance(auc["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(auc["train"]), type(auc["valid"]))
    assert auc["valid"] == auc2

    auc = gbm.auc(train=True,  valid=False, xval=True)
    assert "train" in auc.keys() and "xval" in auc.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(auc.keys())
    assert len(auc) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(auc.keys())
    assert isinstance(auc["train"], float) and isinstance(auc["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(auc["train"]), type(auc["xval"]))
    assert auc["xval"] == auc3

    auc = gbm.auc(train=True,  valid=True,  xval=True)
    assert "train" in auc.keys() and "valid" in auc.keys() and "xval" in auc.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(auc.keys())
    assert len(auc) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(auc.keys())
    assert isinstance(auc["train"], float) and isinstance(auc["valid"], float) and isinstance(auc["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(auc["train"]), type(auc["valid"]), type(auc["xval"]))

    auc = gbm.auc(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(auc, float)
    assert auc == auc1

    auc = gbm.auc(train=False, valid=True,  xval=True)
    assert "valid" in auc.keys() and "xval" in auc.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(auc.keys())
    assert len(auc) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(auc.keys())
    assert isinstance(auc["valid"], float) and isinstance(auc["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(auc["valid"]), type(auc["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True,  valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True,  xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True,  valid=True,  xval=False)
    assert "train" in logloss.keys() and "valid" in logloss.keys(), "expected training and validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 2, "expected only training and validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True,  valid=False, xval=True)
    assert "train" in logloss.keys() and "xval" in logloss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(logloss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True,  valid=True,  xval=True)
    assert "train" in logloss.keys() and "valid" in logloss.keys() and "xval" in logloss.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True,  xval=True)
    assert "valid" in logloss.keys() and "xval" in logloss.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["valid"]), type(logloss["xval"]))

    #   giniCoef
    giniCoef1 = gbm.giniCoef(train=True,  valid=False, xval=False)
    assert isinstance(giniCoef1, float)

    giniCoef2 = gbm.giniCoef(train=False, valid=True,  xval=False)
    assert isinstance(giniCoef2, float)

    giniCoef3 = gbm.giniCoef(train=False, valid=False, xval=True)
    assert isinstance(giniCoef3, float)

    giniCoef = gbm.giniCoef(train=True,  valid=True,  xval=False)
    assert "train" in giniCoef.keys() and "valid" in giniCoef.keys(), "expected training and validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert len(giniCoef) == 2, "expected only training and validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(giniCoef["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(giniCoef["train"]), type(giniCoef["valid"]))
    assert giniCoef["valid"] == giniCoef2

    giniCoef = gbm.giniCoef(train=True,  valid=False, xval=True)
    assert "train" in giniCoef.keys() and "xval" in giniCoef.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert len(giniCoef) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(giniCoef["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(giniCoef["train"]), type(giniCoef["xval"]))
    assert giniCoef["xval"] == giniCoef3

    giniCoef = gbm.giniCoef(train=True,  valid=True,  xval=True)
    assert "train" in giniCoef.keys() and "valid" in giniCoef.keys() and "xval" in giniCoef.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert len(giniCoef) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert isinstance(giniCoef["train"], float) and isinstance(giniCoef["valid"], float) and isinstance(giniCoef["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(giniCoef["train"]), type(giniCoef["valid"]), type(giniCoef["xval"]))

    giniCoef = gbm.giniCoef(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(giniCoef, float)
    assert giniCoef == giniCoef1

    giniCoef = gbm.giniCoef(train=False, valid=True,  xval=True)
    assert "valid" in giniCoef.keys() and "xval" in giniCoef.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert len(giniCoef) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(giniCoef.keys())
    assert isinstance(giniCoef["valid"], float) and isinstance(giniCoef["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(giniCoef["valid"]), type(giniCoef["xval"]))

    #   F1
    F11 = gbm.F1(train=True,  valid=False, xval=False)
    F12 = gbm.F1(train=False, valid=True,  xval=False)
    F13 = gbm.F1(train=False, valid=False, xval=True)
    F1 = gbm.F1(train=True,  valid=True,  xval=False)
    F1 = gbm.F1(train=True,  valid=False, xval=True)
    F1 = gbm.F1(train=True,  valid=True,  xval=True)
    F1 = gbm.F1(train=False, valid=False, xval=False) # default: return training metrics
    F1 = gbm.F1(train=False, valid=True,  xval=True)

    #   F0point5
    F0point51 = gbm.F0point5(train=True,  valid=False, xval=False)
    F0point52 = gbm.F0point5(train=False, valid=True,  xval=False)
    F0point53 = gbm.F0point5(train=False, valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True,  valid=True,  xval=False)
    F0point5 = gbm.F0point5(train=True,  valid=False, xval=True)
    F0point5 = gbm.F0point5(train=True,  valid=True,  xval=True)
    F0point5 = gbm.F0point5(train=False, valid=False, xval=False) # default: return training metrics
    F0point5 = gbm.F0point5(train=False, valid=True,  xval=True)

    #   F2
    F21 = gbm.F2(train=True,  valid=False, xval=False)
    F22 = gbm.F2(train=False, valid=True,  xval=False)
    F23 = gbm.F2(train=False, valid=False, xval=True)
    F2 = gbm.F2(train=True,  valid=True,  xval=False)
    F2 = gbm.F2(train=True,  valid=False, xval=True)
    F2 = gbm.F2(train=True,  valid=True,  xval=True)
    F2 = gbm.F2(train=False, valid=False, xval=False) # default: return training metrics
    F2 = gbm.F2(train=False, valid=True,  xval=True)

    #   accuracy
    accuracy1 = gbm.accuracy(train=True,  valid=False, xval=False)
    accuracy2 = gbm.accuracy(train=False, valid=True,  xval=False)
    accuracy3 = gbm.accuracy(train=False, valid=False, xval=True)
    accuracy = gbm.accuracy(train=True,  valid=True,  xval=False)
    accuracy = gbm.accuracy(train=True,  valid=False, xval=True)
    accuracy = gbm.accuracy(train=True,  valid=True,  xval=True)
    accuracy = gbm.accuracy(train=False, valid=False, xval=False) # default: return training metrics
    accuracy = gbm.accuracy(train=False, valid=True,  xval=True)

    #   error
    error1 = gbm.error(train=True,  valid=False, xval=False)
    error2 = gbm.error(train=False, valid=True,  xval=False)
    error3 = gbm.error(train=False, valid=False, xval=True)
    error = gbm.error(train=True,  valid=True,  xval=False)
    error = gbm.error(train=True,  valid=False, xval=True)
    error = gbm.error(train=True,  valid=True,  xval=True)
    error = gbm.error(train=False, valid=False, xval=False) # default: return training metrics
    error = gbm.error(train=False, valid=True,  xval=True)

    #   precision
    precision1 = gbm.precision(train=True,  valid=False, xval=False)
    precision2 = gbm.precision(train=False, valid=True,  xval=False)
    precision3 = gbm.precision(train=False, valid=False, xval=True)
    precision = gbm.precision(train=True,  valid=True,  xval=False)
    precision = gbm.precision(train=True,  valid=False, xval=True)
    precision = gbm.precision(train=True,  valid=True,  xval=True)
    precision = gbm.precision(train=False, valid=False, xval=False) # default: return training metrics
    precision = gbm.precision(train=False, valid=True,  xval=True)

    #   mcc
    mcc1 = gbm.mcc(train=True,  valid=False, xval=False)
    mcc2 = gbm.mcc(train=False, valid=True,  xval=False)
    mcc3 = gbm.mcc(train=False, valid=False, xval=True)
    mcc = gbm.mcc(train=True,  valid=True,  xval=False)
    mcc = gbm.mcc(train=True,  valid=False, xval=True)
    mcc = gbm.mcc(train=True,  valid=True,  xval=True)
    mcc = gbm.mcc(train=False, valid=False, xval=False) # default: return training metrics
    mcc = gbm.mcc(train=False, valid=True,  xval=True)

    #   max_per_class_error
    max_per_class_error1 = gbm.max_per_class_error(train=True,  valid=False, xval=False)
    max_per_class_error2 = gbm.max_per_class_error(train=False, valid=True,  xval=False)
    max_per_class_error3 = gbm.max_per_class_error(train=False, valid=False, xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,  valid=True,  xval=False)
    max_per_class_error = gbm.max_per_class_error(train=True,  valid=False, xval=True)
    max_per_class_error = gbm.max_per_class_error(train=True,  valid=True,  xval=True)
    max_per_class_error = gbm.max_per_class_error(train=False, valid=False, xval=False) # default: return training metrics
    max_per_class_error = gbm.max_per_class_error(train=False, valid=True,  xval=True)

    #   confusion_matrix
    confusion_matrix1 = gbm.confusion_matrix(train=True,  valid=False, xval=False)
    confusion_matrix2 = gbm.confusion_matrix(train=False, valid=True,  xval=False)
    confusion_matrix3 = gbm.confusion_matrix(train=False, valid=False, xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True,  valid=True,  xval=False)
    confusion_matrix = gbm.confusion_matrix(train=True,  valid=False, xval=True)
    confusion_matrix = gbm.confusion_matrix(train=True,  valid=True,  xval=True)
    confusion_matrix = gbm.confusion_matrix(train=False, valid=False, xval=False) # default: return training metrics
    confusion_matrix = gbm.confusion_matrix(train=False, valid=True,  xval=True)


    # #   plot
    # plot1 = gbm.plot(train=True,  valid=False, xval=False)
    # plot2 = gbm.plot(train=False, valid=True,  xval=False)
    # plot3 = gbm.plot(train=False, valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=False)
    # plot = gbm.plot(train=True,  valid=False, xval=True)
    # plot = gbm.plot(train=True,  valid=True,  xval=True)
    # plot = gbm.plot(train=False, valid=False, xval=False) # default: return training metrics
    # plot = gbm.plot(train=False, valid=True,  xval=True)

    # #   tpr
    # tpr1 = gbm.tpr(train=True,  valid=False, xval=False)
    # tpr2 = gbm.tpr(train=False, valid=True,  xval=False)
    # tpr3 = gbm.tpr(train=False, valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=False)
    # tpr = gbm.tpr(train=True,  valid=False, xval=True)
    # tpr = gbm.tpr(train=True,  valid=True,  xval=True)
    # tpr = gbm.tpr(train=False, valid=False, xval=False) # default: return training metrics
    # tpr = gbm.tpr(train=False, valid=True,  xval=True)
    #
    # #   tnr
    # tnr1 = gbm.tnr(train=True,  valid=False, xval=False)
    # tnr2 = gbm.tnr(train=False, valid=True,  xval=False)
    # tnr3 = gbm.tnr(train=False, valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=False)
    # tnr = gbm.tnr(train=True,  valid=False, xval=True)
    # tnr = gbm.tnr(train=True,  valid=True,  xval=True)
    # tnr = gbm.tnr(train=False, valid=False, xval=False) # default: return training metrics
    # tnr = gbm.tnr(train=False, valid=True,  xval=True)
    #
    # #   fnr
    # fnr1 = gbm.fnr(train=True,  valid=False, xval=False)
    # fnr2 = gbm.fnr(train=False, valid=True,  xval=False)
    # fnr3 = gbm.fnr(train=False, valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=False)
    # fnr = gbm.fnr(train=True,  valid=False, xval=True)
    # fnr = gbm.fnr(train=True,  valid=True,  xval=True)
    # fnr = gbm.fnr(train=False, valid=False, xval=False) # default: return training metrics
    # fnr = gbm.fnr(train=False, valid=True,  xval=True)
    #
    # #   fpr
    # fpr1 = gbm.fpr(train=True,  valid=False, xval=False)
    # fpr2 = gbm.fpr(train=False, valid=True,  xval=False)
    # fpr3 = gbm.fpr(train=False, valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=False)
    # fpr = gbm.fpr(train=True,  valid=False, xval=True)
    # fpr = gbm.fpr(train=True,  valid=True,  xval=True)
    # fpr = gbm.fpr(train=False, valid=False, xval=False) # default: return training metrics
    # fpr = gbm.fpr(train=False, valid=True,  xval=True)


    # multinomial
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    distribution = "multinomial"
    predictors = ["displacement","power","weight","acceleration","year"]
    gbm = h2o.gbm(y=train[response_col],
                  x=train[predictors],
                  validation_y=valid[response_col],
                  validation_x=valid[predictors],
                  nfolds=3,
                  distribution=distribution,
                  fold_assignment="Random")

    #   mse
    mse1 = gbm.mse(train=True,  valid=False, xval=False)
    assert isinstance(mse1, float)

    mse2 = gbm.mse(train=False, valid=True,  xval=False)
    assert isinstance(mse2, float)

    mse3 = gbm.mse(train=False, valid=False, xval=True)
    assert isinstance(mse3, float)

    mse = gbm.mse(train=True,  valid=True,  xval=False)
    assert "train" in mse.keys() and "valid" in mse.keys(), "expected training and validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 2, "expected only training and validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["train"], float) and isinstance(mse["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["valid"]))
    assert mse["valid"] == mse2

    mse = gbm.mse(train=True,  valid=False, xval=True)
    assert "train" in mse.keys() and "xval" in mse.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["train"], float) and isinstance(mse["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["train"]), type(mse["xval"]))
    assert mse["xval"] == mse3

    mse = gbm.mse(train=True,  valid=True,  xval=True)
    assert "train" in mse.keys() and "valid" in mse.keys() and "xval" in mse.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["train"], float) and isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(mse["train"]), type(mse["valid"]), type(mse["xval"]))

    mse = gbm.mse(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(mse, float)
    assert mse == mse1

    mse = gbm.mse(train=False, valid=True,  xval=True)
    assert "valid" in mse.keys() and "xval" in mse.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert len(mse) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(mse.keys())
    assert isinstance(mse["valid"], float) and isinstance(mse["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(mse["valid"]), type(mse["xval"]))

    #   logloss
    logloss1 = gbm.logloss(train=True,  valid=False, xval=False)
    assert isinstance(logloss1, float)

    logloss2 = gbm.logloss(train=False, valid=True,  xval=False)
    assert isinstance(logloss2, float)

    logloss3 = gbm.logloss(train=False, valid=False, xval=True)
    assert isinstance(logloss3, float)

    logloss = gbm.logloss(train=True,  valid=True,  xval=False)
    assert "train" in logloss.keys() and "valid" in logloss.keys(), "expected training and validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 2, "expected only training and validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float), "expected training and validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["valid"]))
    assert logloss["valid"] == logloss2

    logloss = gbm.logloss(train=True,  valid=False, xval=True)
    assert "train" in logloss.keys() and "xval" in logloss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(logloss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["train"]), type(logloss["xval"]))
    assert logloss["xval"] == logloss3

    logloss = gbm.logloss(train=True,  valid=True,  xval=True)
    assert "train" in logloss.keys() and "valid" in logloss.keys() and "xval" in logloss.keys(), "expected training, validation, and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 3, "expected training, validation and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["train"], float) and isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "expected training, validation, and cross validation metrics to be floats, but got {0}, {1}, and {2}".format(type(logloss["train"]), type(logloss["valid"]), type(logloss["xval"]))

    logloss = gbm.logloss(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(logloss, float)
    assert logloss == logloss1

    logloss = gbm.logloss(train=False, valid=True,  xval=True)
    assert "valid" in logloss.keys() and "xval" in logloss.keys(), "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert len(logloss) == 2, "expected validation and cross validation metrics to be returned, but got {0}".format(logloss.keys())
    assert isinstance(logloss["valid"], float) and isinstance(logloss["xval"], float), "validation and cross validation metrics to be floats, but got {0} and {1}".format(type(logloss["valid"]), type(logloss["xval"]))

    #   hit_ratio_table
    hit_ratio_table1 = gbm.hit_ratio_table(train=True,  valid=False, xval=False)
    hit_ratio_table2 = gbm.hit_ratio_table(train=False, valid=True,  xval=False)
    hit_ratio_table3 = gbm.hit_ratio_table(train=False, valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True,  valid=True,  xval=False)
    hit_ratio_table = gbm.hit_ratio_table(train=True,  valid=False, xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=True,  valid=True,  xval=True)
    hit_ratio_table = gbm.hit_ratio_table(train=False, valid=False, xval=False) # default: return training metrics
    hit_ratio_table = gbm.hit_ratio_table(train=False, valid=True,  xval=True)


    # clustering
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
    km = h2o.kmeans(x=iris[0:4],
                    nfolds=3,
                    k=3)

    #   betweenss
    betweenss1 = km.betweenss(train=True,  valid=False, xval=False)
    assert isinstance(betweenss1, float)

    betweenss3 = km.betweenss(train=False, valid=False, xval=True)
    assert isinstance(betweenss3, float)

    betweenss = km.betweenss(train=True,  valid=False, xval=True)
    assert "train" in betweenss.keys() and "xval" in betweenss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(betweenss.keys())
    assert len(betweenss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(betweenss.keys())
    assert isinstance(betweenss["train"], float) and isinstance(betweenss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(betweenss["train"]), type(betweenss["xval"]))
    assert betweenss["xval"] == betweenss3

    betweenss = km.betweenss(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(betweenss, float)
    assert betweenss == betweenss1

    #   totss
    totss1 = km.totss(train=True,  valid=False, xval=False)
    assert isinstance(totss1, float)

    totss3 = km.totss(train=False, valid=False, xval=True)
    assert isinstance(totss3, float)

    totss = km.totss(train=True,  valid=False, xval=True)
    assert "train" in totss.keys() and "xval" in totss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(totss.keys())
    assert len(totss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(totss.keys())
    assert isinstance(totss["train"], float) and isinstance(totss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(totss["train"]), type(totss["xval"]))
    assert totss["xval"] == totss3

    totss = km.totss(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(totss, float)
    assert totss == totss1

    #   tot_withinss
    tot_withinss1 = km.tot_withinss(train=True,  valid=False, xval=False)
    assert isinstance(tot_withinss1, float)

    tot_withinss3 = km.tot_withinss(train=False, valid=False, xval=True)
    assert isinstance(tot_withinss3, float)

    tot_withinss = km.tot_withinss(train=True,  valid=False, xval=True)
    assert "train" in tot_withinss.keys() and "xval" in tot_withinss.keys(), "expected training and cross validation metrics to be returned, but got {0}".format(tot_withinss.keys())
    assert len(tot_withinss) == 2, "expected only training and cross validation metrics to be returned, but got {0}".format(tot_withinss.keys())
    assert isinstance(tot_withinss["train"], float) and isinstance(tot_withinss["xval"], float), "expected training and cross validation metrics to be floats, but got {0} and {1}".format(type(tot_withinss["train"]), type(tot_withinss["xval"]))
    assert tot_withinss["xval"] == tot_withinss3

    tot_withinss = km.tot_withinss(train=False, valid=False, xval=False) # default: return training metrics
    assert isinstance(tot_withinss, float)
    assert tot_withinss == tot_withinss1

    #   withinss
    withinss1 = km.withinss(train=True,  valid=False, xval=False)
    withinss3 = km.withinss(train=False, valid=False, xval=True)
    withinss = km.withinss(train=True,  valid=False, xval=True)
    withinss = km.withinss(train=False, valid=False, xval=False) # default: return training metrics

    #   centroid_stats
    centroid_stats1 = km.centroid_stats(train=True,  valid=False, xval=False)
    centroid_stats3 = km.centroid_stats(train=False, valid=False, xval=True)
    centroid_stats = km.centroid_stats(train=True,  valid=False, xval=True)
    centroid_stats = km.centroid_stats(train=False, valid=False, xval=False) # default: return training metrics

    #   size
    size1 = km.size(train=True,  valid=False, xval=False)
    size3 = km.size(train=False, valid=False, xval=True)
    size = km.size(train=True,  valid=False, xval=True)
    size = km.size(train=False, valid=False, xval=False) # default: return training metrics
# Exemplo n.º 53 (score: 0)
def init_err_casesKmeans(ip, port):
    """Negative-path tests for h2o.kmeans() argument validation.

    Every invalid configuration below must raise an EnvironmentError from
    the H2O client; degenerate-but-legal inputs (NA/Inf entries, duplicated
    initial centers) must still train successfully.

    ``ip`` and ``port`` are kept for test-harness compatibility; the
    connection to the H2O cluster is assumed to already exist.
    """
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(
        path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()
    numcol = benign_h2o.ncol
    numrow = benign_h2o.nrow

    # Log.info("Non-numeric entry that isn't 'Random', 'PlusPlus', or 'Furthest'")
    try:
        h2o.kmeans(x=benign_h2o, k=5, init='Test123')
        assert False, "expected an error"
    except EnvironmentError:
        pass

    # Log.info("Empty list, tuple, or dictionary")
    # All three empty containers must be rejected as user_points.
    for empty_points in ([], (), {}):
        try:
            h2o.kmeans(x=benign_h2o, k=0, user_points=empty_points)
            assert False, "expected an error"
        except EnvironmentError:
            pass

    # Log.info("Number of columns doesn't equal training set's")
    start_small = [[random.gauss(0, 1) for c in range(numcol - 2)]
                   for r in range(5)]
    start_large = [[random.gauss(0, 1) for c in range(numcol + 2)]
                   for r in range(5)]

    # Both too-narrow and too-wide user_points frames must be rejected.
    for bad_start in (start_small, start_large):
        try:
            h2o.kmeans(x=benign_h2o, k=5, user_points=h2o.H2OFrame(bad_start))
            assert False, "expected an error"
        except EnvironmentError:
            pass

    # Log.info("Number of rows exceeds training set's")
    start = [[random.gauss(0, 1) for c in range(numcol)]
             for r in range(numrow + 2)]
    try:
        h2o.kmeans(x=benign_h2o, k=numrow + 2, user_points=h2o.H2OFrame(start))
        assert False, "expected an error"
    except EnvironmentError:
        pass

    # Nones are replaced with mean of a column in H2O. Not sure about Inf.
    # Log.info("Any entry is NA, NaN, or Inf")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    for x in ["NA", "NaN", "Inf", "-Inf"]:
        # Deep-copy the rows: a shallow copy (start[:]) shares the inner
        # lists, so assigning into start_err[1] would mutate `start` and
        # leak each iteration's bad entry into the next one.
        start_err = [row[:] for row in start]
        start_err[1][random.randint(0, numcol - 1)] = x
        h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start_err))

    # Duplicates will affect sampling probability during initialization.
    # Log.info("Duplicate initial clusters specified")
    start = [[random.gauss(0, 1) for c in range(numcol)] for r in range(3)]
    start[2] = start[0]
    h2o.kmeans(x=benign_h2o, k=3, user_points=h2o.H2OFrame(start))
# Exemplo n.º 54 (score: 0)
def baddataKmeans(ip,port):
  """Exercise h2o.kmeans() on pathological training data.

  Scenarios (each asserts on the fitted cluster centers):
    * one row of all Nones  -> imputed with column means, all cols kept
    * one constant column   -> dropped automatically (centers lose a dim)
    * one all-None column and one all-zero column -> both dropped
    * frame of all Nones    -> training must raise EnvironmentError
    * all-categorical frame -> trains, centers keep all columns
    * iris.csv (4 numeric cols + class column) -> trains, 5-dim centers

  ``ip`` and ``port`` are kept for test-harness compatibility; the H2O
  connection is assumed to already exist.
  """
  # Connect to a pre-existing cluster
  # connect to localhost:54321

  rows = 100
  cols = 10
  rawdata = [[random.random() for c in range(cols)] for r in range(rows)]

  # NOTE: each scenario below takes a *deep* copy of rawdata. The old
  # shallow copy (rawdata[:]) shared the inner row lists, so the None row
  # from scenario 1 and the constant column from scenario 2 leaked into
  # every later scenario, silently changing what each case tested.

  # Row elements that are None will be replaced with mean of column
  #Log.info("Training data with 1 row of all Nones: replace with column mean")
  data = [row[:] for row in rawdata]
  data[24] = [None] * cols
  frame = h2o.H2OFrame(data)

  km_model = h2o.kmeans(x=frame, k=5)

  centers = km_model.centers()
  assert len(centers) == 5, "expected 5 centers"
  for c in range(len(centers)):
    assert len(centers[c]) == 10, "expected center to be 10 dimensional"

  # Columns with constant value will be automatically dropped
  #Log.info("Training data with 1 col of all 5's: drop automatically")
  data = [row[:] for row in rawdata]
  for row in data:
    row[4] = 5
  frame = h2o.H2OFrame(data)

  km_model = h2o.kmeans(x=frame, k=5)

  # The constant column is dropped, so centers are one dimension short.
  centers = km_model.centers()
  assert len(centers) == 5, "expected 5 centers"
  for c in range(len(centers)):
    assert len(centers[c]) == 9, "expected center to be 9 "
  # TODO: expect_warning(km_model = h2o.kmeans(x=frame, k=5))

  # Log.info("Training data with 1 col of all None's, 1 col of all zeroes: drop automatically")
  data = [row[:] for row in rawdata]
  for row in data:
    row[4] = None
    row[7] = 0
  frame = h2o.H2OFrame(data)

  km_model = h2o.kmeans(x=frame, k=5)

  # Two constant/empty columns dropped -> 8-dimensional centers.
  centers = km_model.centers()
  assert len(centers) == 5, "expected 5 centers"
  for c in range(len(centers)):
    assert len(centers[c]) == 8, "expected center to be 8 "
  # TODO: expect_warning(km_model = h2o.kmeans(x=frame, k=5))

  # Log.info("Training data with all None's")
  data = [[None for c in range(cols)] for r in range(rows)]
  frame = h2o.H2OFrame(data)

  # A frame with no usable values cannot be clustered.
  try:
    h2o.kmeans(x=frame, k=5)
    assert False, "expected an error"
  except EnvironmentError:
    pass

  # Log.info("Training data with a categorical column(s)")
  data = [[random.choice(string.ascii_uppercase) for c in range(cols)] for r in range(rows)]
  frame = h2o.H2OFrame(data)

  km_model = h2o.kmeans(x=frame, k=5)
  centers = km_model.centers()
  assert len(centers) == 5, "expected 5 centers"
  for c in range(len(centers)):
    assert len(centers[c]) == 10, "expected center to be 10 "+str(len(centers[c]))

  # Log.info("Importing iris.csv data...\n")
  iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

  km_model = h2o.kmeans(x=iris, k=5)
  centers = km_model.centers()
  assert len(centers) == 5, "expected 5 centers"
  for c in range(len(centers)):
    assert len(centers[c]) == 5, "expected center to be 5 "+str(len(centers[c]))