Exemplo n.º 1
0
def hdfs_kmeans_airlines():
    """Import airlines_all.csv from HDFS and fit a k-means model on it.

    Only a notice is printed when the test is not running inside the H2O
    network, because the HDFS name node is unreachable from outside.
    """
    # Bail out early when the internal name node cannot be reached.
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()

    print("Import airlines_all.csv from HDFS")
    source = "hdfs://{0}{1}".format(name_node, "/datasets/airlines_all.csv")
    airlines = h2o.import_file(source)
    row_count = airlines.nrow
    print("rows: {0}".format(row_count))

    print("Run k-means++ with k = 7 and max_iterations = 10")
    # Predictor column indices: four half-open index ranges followed by 9.
    predictor_idx = []
    for start, stop in ((0, 8), (11, 16), (18, 21), (24, 29)):
        predictor_idx.extend(range(start, stop))
    predictor_idx.append(9)

    model = h2o.kmeans(training_frame=airlines,
                       x=airlines[predictor_idx],
                       k=7,
                       init="Furthest",
                       max_iterations=10,
                       standardize=True)
    print(model)
Exemplo n.º 2
0
def directory_import():
    """Check that importing a directory yields three times one file's rows.

    The comparison is always run against the local smalldata copy and, when
    running inside the H2O network, against the HDFS copy first.
    """

    def compare_file_with_directory(file_path, dir_path):
        # Import both sources before reading any dimensions.
        single = h2o.import_file(file_path)
        combined = h2o.import_file(dir_path)

        rows_single, cols_single = single.dim
        rows_combined, cols_combined = combined.dim

        assert rows_single*3 == rows_combined, "Expected 3 times the rows, but got {0} and {1}".format(rows_single,rows_combined)
        assert cols_single == cols_combined, "Expected same number of cols, but got {0} and {1}".format(cols_single,cols_combined)

    if tests.is_running_internal_to_h2o():
        name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_file = "hdfs://{0}{1}".format(name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        hdfs_dir = "hdfs://{0}{1}".format(name_node, "/datasets/iris/identical_iris_files/")
        print("Importing HDFS file {0} and directory {1}".format(hdfs_file, hdfs_dir))
        compare_file_with_directory(hdfs_file, hdfs_dir)

    local_file = h2o.locate("smalldata/jira/identical_files/iris1.csv")
    # The directory is whatever precedes the file name in the located path.
    local_dir = local_file.split("iris1.csv")[0]
    print("Importing smalldata file {0} and directory {1}".format(local_file, local_dir))
    compare_file_with_directory(local_file, local_dir)
Exemplo n.º 3
0
def pubdev_1431(ip, port):
    """Train a one-tree GBM on the airlines-billion data and round-trip its predictions.

    When running inside the H2O network the data set is imported from HDFS,
    column 30 is converted to a factor and used as the response for a
    one-tree, depth-one bernoulli GBM on columns 0-29. The predictions are
    downloaded to ``delete.csv`` in the current working directory,
    re-imported, and the dimensions are compared. Outside the H2O network
    only a notice is printed.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)

        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30],
                      y=airlines_billion_1[30],
                      ntrees=1,
                      distribution="bernoulli",
                      max_depth=1)

        predictions = gbm.predict(airlines_billion_1)

        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)

        # Remove the temporary file even when the re-import fails.
        try:
            airlines_billion_2 = h2o.import_file(csv)
        finally:
            os.remove(csv)

        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        # NOTE(review): this compares the training frame with the re-imported
        # predictions; confirm that equal column counts are really expected.
        # The message uses placeholders {2}/{3} so the column counts are
        # reported instead of the row counts a second time.
        assert r1 == r2 and c1 == c2, "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal c1: {2} " \
                                      "c2: {3}".format(r1, r2, c1, c2)
    else:
        print("Not running on H2O internal network.  No access to HDFS.")
Exemplo n.º 4
0
def pubdev_1431(ip, port):
    """Round-trip GBM predictions on the airlines-billion data through a CSV file.

    Inside the H2O network: import the data from HDFS, turn column 30 into a
    factor, fit a single depth-one bernoulli tree on columns 0-29, download
    the predictions to ``delete.csv`` in the working directory, import that
    file again and compare dimensions. Outside the H2O network a notice is
    printed and nothing else happens.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)

        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30],
                      y=airlines_billion_1[30],
                      ntrees=1,
                      distribution="bernoulli",
                      max_depth=1)

        predictions = gbm.predict(airlines_billion_1)

        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)

        # Clean up the downloaded file whether or not the import succeeds.
        try:
            airlines_billion_2 = h2o.import_file(csv)
        finally:
            os.remove(csv)

        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        # NOTE(review): the frames compared are the training data and the
        # re-imported predictions; verify the column-count check is intended.
        # Column counts are formatted with {2}/{3}; {0}/{1} are the row counts.
        assert r1 == r2 and c1 == c2, "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal c1: {2} " \
                                      "c2: {3}".format(r1, r2, c1, c2)
    else:
        print("Not running on H2O internal network.  No access to HDFS.")
Exemplo n.º 5
0
def hdfs_basic():
    """Import the iris data from HDFS as one file and as a list of two files.

    Each import must produce an ``h2o.H2OFrame`` with 150 rows. Outside the
    H2O network only a notice is printed.
    """

    def import_and_check(source):
        # Import, touch both ends of the frame, then verify size and type.
        frame = h2o.import_file(source)
        frame.head()
        frame.tail()
        row_count = frame.nrow
        print("rows: {0}".format(row_count))
        assert row_count == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(row_count, 150)
        assert isinstance(frame, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(type(frame))
        print("Import worked")
        return frame

    # Without access to the name node there is nothing to test.
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    iris_file = "/datasets/runit/iris_wheader.csv"
    iris_dir = "/datasets/runit/iris_test_train"

    # ---- Single file case ----
    print("Testing single file importHDFS")
    iris_h2o = import_and_check("hdfs://{0}{1}".format(name_node, iris_file))

    # ---- Multiple files from one directory ----
    print("Testing directory importHDFS")
    part_templates = ("hdfs://{0}{1}/iris_test.csv", "hdfs://{0}{1}/iris_train.csv")
    part_urls = [template.format(name_node, iris_dir) for template in part_templates]
    iris_dir_h2o = import_and_check(part_urls)
Exemplo n.º 6
0
def pubdev_1421():
    """Import airlines.test.csv from HDFS when running inside the H2O network.

    Outside the H2O network the function returns without doing anything.
    """
    # The name node is only reachable from inside the H2O network.
    if not tests.is_running_internal_to_h2o():
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    source = "hdfs://{0}{1}".format(name_node, "/datasets/airlines.test.csv")
    # NOTE(review): the imported frame is not used further in the visible code.
    air_test = h2o.import_file(source)
def kmeans_mllib(ip, port):
    """Compare H2O k-means on BigCross with stored MLlib benchmark results.

    For every row of ``smalldata/mllib_bench/bigcross_wcsse.csv`` a k-means
    model with that row's cluster count is trained on the BigCross data
    imported from HDFS. The H2O centers are printed next to the MLlib centers
    read from ``bigcross_centers_<k>.csv``, and the H2O average within-cluster
    SSE must equal the MLlib value from the same row. Outside the H2O network
    only a notice is printed.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # Column 0 of each benchmark row is the cluster count, column 1 the
        # MLlib within-cluster SSE it is compared against.
        err_mllib = np.genfromtxt(
            h2o.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
            delimiter=",",
            skip_header=1)

        # Walk the rows directly so every row of the file is supported, not
        # just the first four.
        for row in err_mllib:
            k = int(row[0])
            wcsse_mllib = row[1]

            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o,
                                  x=cross_h2o,
                                  k=k,
                                  init="PlusPlus",
                                  max_iterations=10,
                                  standardize=False)

            clust_mllib = np.genfromtxt(
                h2o.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) +
                           ".csv"),
                delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            wcsse_h2o = cross_km.tot_withinss() / n
            # The placeholders make the values actually appear in the output.
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            # NOTE(review): exact floating-point equality is kept as written;
            # confirm whether a tolerance was intended.
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        print("Not running on H2O internal network.  No access to HDFS.")
Exemplo n.º 8
0
def pubdev_1421():
    """Import the airlines test file from HDFS if the name node is reachable.

    Nothing happens when the test runs outside the H2O network.
    """
    inside_h2o = tests.is_running_internal_to_h2o()
    if not inside_h2o:
        # No access to the internal HDFS name node; nothing to import.
        return

    airlines_test_path = "/datasets/airlines.test.csv"
    name_node = tests.get_h2o_internal_hdfs_name_node()

    # NOTE(review): the imported frame is not used further in the visible code.
    air_test = h2o.import_file("hdfs://{0}{1}".format(name_node, airlines_test_path))
def hdfs_kmeans_converge():
    """Check that k-means on BigCross converges or uses up its iterations.

    A first model is trained with k = 3 and at most 10 iterations. A second
    model is started from the first model's centers and run for a single
    iteration. The summed squared difference between the two sets of centers,
    divided by k, must be below 1e-6 unless the iteration condition in the
    final assertion holds. Outside the H2O network only a notice is printed.
    """
    # Nothing can be imported without access to the internal name node.
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()

    print("Import BigCross.data from HDFS")
    source = "hdfs://{0}{1}".format(name_node, "/datasets/runit/BigCross.data")
    cross = h2o.import_file(source)
    row_count = cross.nrow
    print("rows: {0}".format(row_count))

    ncent = 3
    miters = 10

    print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
    first_km = h2o.kmeans(training_frame=cross,
                          x=cross[0:57],
                          k=ncent,
                          max_iterations=miters)
    print(first_km)

    print("Run k-means with init = final cluster centers and max_iterations = 1")
    # The frame holding the start points stays referenced while its key is in use.
    start_points = h2o.H2OFrame(first_km.centers())
    start_points_key = start_points.send_frame()
    second_km = h2o.kmeans(training_frame=cross,
                           x=cross[0:57],
                           k=ncent,
                           user_points=start_points_key,
                           max_iterations=1)
    print(second_km)

    print("Check k-means converged or maximum iterations reached")
    centers_first = h2o.H2OFrame(first_km.centers())
    centers_second = h2o.H2OFrame(second_km.centers())
    avg_change = ((centers_first - centers_second)**2).sum() / ncent
    # Row 0, column 3 of the model summary - presumably the iteration count;
    # verify against the summary's column layout.
    summary = first_km._model_json['output']['model_summary']
    iters = summary.cell_values[0][3]
    # NOTE(review): `iters > miters` looks unreachable if max_iterations caps
    # the run; confirm whether `>=` was intended.
    assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                "{0} and iterations = {1}".format(avg_change, iters)
Exemplo n.º 10
0
def pubdev_1431(ip, port):
    """Train a one-tree GBM on the airlines-billion data and download its predictions.

    Inside the H2O network the data is imported from HDFS, column 30 becomes
    a factor response, columns 0-29 are the predictors, and the predictions
    are downloaded to ``delete.csv`` in the working directory, which is then
    removed. Outside the H2O network only a notice is printed.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    source = "hdfs://{0}{1}".format(name_node, "/datasets/airlinesbillion.csv")
    frame = h2o.import_file(source)

    # The response column must be categorical for the bernoulli distribution.
    frame[30] = frame[30].asfactor()
    model = h2o.gbm(x=frame[0:30],
                    y=frame[30],
                    ntrees=1,
                    distribution="bernoulli",
                    max_depth=1)
    scored = model.predict(frame)

    # Download to a scratch file and delete it straight away.
    scratch_path = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(scored, scratch_path)
    os.remove(scratch_path)
Exemplo n.º 11
0
def pubdev_1431():
    """Download GBM predictions for the airlines-billion data to a scratch CSV.

    When the internal HDFS name node is reachable, the data set is imported,
    column 30 is converted to a factor, a single depth-one bernoulli tree is
    fitted on columns 0-29, and the predictions are written to ``delete.csv``
    in the working directory before that file is removed again. Otherwise a
    notice is printed.
    """
    inside_h2o = tests.is_running_internal_to_h2o()
    if not inside_h2o:
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    data_path = "/datasets/airlinesbillion.csv"
    name_node = tests.get_h2o_internal_hdfs_name_node()
    airlines = h2o.import_file("hdfs://{0}{1}".format(name_node, data_path))

    airlines[30] = airlines[30].asfactor()
    gbm_args = dict(x=airlines[0:30],
                    y=airlines[30],
                    ntrees=1,
                    distribution="bernoulli",
                    max_depth=1)
    model = h2o.gbm(**gbm_args)
    preds = model.predict(airlines)

    target = os.path.join(os.getcwd(), "delete.csv")
    h2o.download_csv(preds, target)
    os.remove(target)
Exemplo n.º 12
0
def hdfs_basic():
    """Exercise basic HDFS imports of the iris data.

    First a single file is imported, then a list of two files from one
    directory; both results must be ``h2o.H2OFrame`` objects with 150 rows.
    A notice is printed instead when running outside the H2O network.
    """
    inside_h2o = tests.is_running_internal_to_h2o()
    if not inside_h2o:
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    single_path = "/datasets/runit/iris_wheader.csv"
    parts_dir = "/datasets/runit/iris_test_train"
    expected_rows = 150

    # --- one file ---------------------------------------------------------
    print("Testing single file importHDFS")
    single_url = "hdfs://{0}{1}".format(name_node, single_path)
    single_frame = h2o.import_file(single_url)
    single_frame.head()
    single_frame.tail()
    rows = single_frame.nrow
    print("rows: {0}".format(rows))
    assert rows == expected_rows, "Wrong number of rows. Got {0}. Should have got {1}".format(rows, 150)
    assert isinstance(single_frame, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(
        type(single_frame))
    print("Import worked")

    # --- two files passed as a list ---------------------------------------
    print("Testing directory importHDFS")
    test_url = "hdfs://{0}{1}/iris_test.csv".format(name_node, parts_dir)
    train_url = "hdfs://{0}{1}/iris_train.csv".format(name_node, parts_dir)
    parts_frame = h2o.import_file([test_url, train_url])
    parts_frame.head()
    parts_frame.tail()
    rows = parts_frame.nrow
    print("rows: {0}".format(rows))
    assert rows == expected_rows, "Wrong number of rows. Got {0}. Should have got {1}".format(rows, 150)
    assert isinstance(parts_frame, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(
        type(parts_frame))
    print("Import worked")
Exemplo n.º 13
0
def hdfs_kmeans(ip, port):
    """Run k-means on the iris and covtype data sets imported from HDFS.

    Iris must have 150 rows and is clustered with k = 3 on columns 0-3;
    covtype must have 581012 rows and is clustered with k = 8 on columns
    0-54. Both runs use at most 10 iterations. Outside the H2O network only
    a notice is printed.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    # The data lives on the internal name node only.
    if not tests.is_running_internal_to_h2o():
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    iris_path = "/datasets/runit/iris_wheader.csv"
    covtype_path = "/datasets/runit/covtype.data"

    print("Import iris_wheader.csv from HDFS")
    iris = h2o.import_file("hdfs://{0}{1}".format(name_node, iris_path))
    iris_rows = iris.nrow
    print("rows: {0}".format(iris_rows))
    assert iris_rows == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(iris_rows, 150)

    print("Running KMeans on iris")
    iris_model = h2o.kmeans(training_frame=iris,
                            k=3,
                            x=iris[0:4],
                            max_iterations=10)
    print(iris_model)

    print("Importing covtype.data from HDFS")
    covtype = h2o.import_file("hdfs://{0}{1}".format(name_node, covtype_path))
    covtype_rows = covtype.nrow
    print("rows: {0}".format(covtype_rows))
    assert covtype_rows == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(covtype_rows, 581012)

    print("Running KMeans on covtype")
    covtype_model = h2o.kmeans(training_frame=covtype,
                               x=covtype[0:55],
                               k=8,
                               max_iterations=10)
    print(covtype_model)
def kmeans_mllib():
    """Check H2O k-means on BigCross against stored MLlib benchmark numbers.

    Each row of ``smalldata/mllib_bench/bigcross_wcsse.csv`` supplies a
    cluster count and the MLlib within-cluster SSE for it. For every row a
    k-means model is trained on the BigCross data imported from HDFS, the
    H2O and MLlib centers (from ``bigcross_centers_<k>.csv``) are printed,
    and the H2O average within-cluster SSE must equal the MLlib value.
    Outside the H2O network only a notice is printed.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # Column 0 holds the cluster count, column 1 the MLlib SSE value.
        err_mllib = np.genfromtxt(h2o.locate("smalldata/mllib_bench/bigcross_wcsse.csv"), delimiter=",", skip_header=1)

        # Take k and its expected SSE from the same row; this removes the
        # lookup that only searched the first four rows.
        for bench_row in err_mllib:
            k = int(bench_row[0])
            wcsse_mllib = bench_row[1]

            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o, k=k, init="PlusPlus",
                                  max_iterations=10, standardize=False)

            clust_mllib = np.genfromtxt(h2o.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"),
                                        delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            wcsse_h2o = cross_km.tot_withinss() / n
            # Without a placeholder the formatted value was silently dropped.
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            # NOTE(review): exact floating-point equality is kept as written;
            # confirm whether a tolerance was intended.
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        print("Not running on H2O internal network.  No access to HDFS.")
def hdfs_kmeans_converge(ip, port):
    """Verify k-means on BigCross has converged after its first run.

    Trains k-means (k = 3, at most 10 iterations) on columns 0-56 of the
    BigCross data from HDFS, restarts it for one iteration from the resulting
    centers, and asserts on how far the centers moved. Outside the H2O
    network only a notice is printed.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    inside_h2o = tests.is_running_internal_to_h2o()
    if not inside_h2o:
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    cross_path = "/datasets/runit/BigCross.data"
    name_node = tests.get_h2o_internal_hdfs_name_node()

    print("Import BigCross.data from HDFS")
    bigcross = h2o.import_file("hdfs://{0}{1}".format(name_node, cross_path))
    total_rows = bigcross.nrow
    print("rows: {0}".format(total_rows))

    ncent = 3
    miters = 10

    print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
    initial_run = h2o.kmeans(training_frame=bigcross,
                             x=bigcross[0:57],
                             k=ncent,
                             max_iterations=miters)
    print(initial_run)

    print("Run k-means with init = final cluster centers and max_iterations = 1")
    # Keep the seed frame bound to a name for as long as its key is used.
    seed_frame = h2o.H2OFrame(initial_run.centers())
    seed_key = seed_frame.send_frame()
    restart_run = h2o.kmeans(training_frame=bigcross,
                             x=bigcross[0:57],
                             k=ncent,
                             user_points=seed_key,
                             max_iterations=1)
    print(restart_run)

    print("Check k-means converged or maximum iterations reached")
    before = h2o.H2OFrame(initial_run.centers())
    after = h2o.H2OFrame(restart_run.centers())
    avg_change = ((before - after)**2).sum() / ncent
    # Presumably the iteration count of the first run (summary row 0,
    # column 3); verify against the model summary layout.
    iters = initial_run._model_json['output']['model_summary'].cell_values[0][3]
    # NOTE(review): with max_iterations = miters, `iters > miters` may never
    # hold; confirm whether `>=` was intended.
    assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                "{0} and iterations = {1}".format(avg_change, iters)
def hdfs_kmeans_airlines():
    """Fit k-means (k = 7, "Furthest" init) on the airlines data from HDFS.

    The full airlines file is imported from the internal name node, its row
    count is printed, and a standardized k-means model with at most 10
    iterations is trained on a fixed list of column indices. Outside the
    H2O network only a notice is printed.
    """
    inside_h2o = tests.is_running_internal_to_h2o()

    if inside_h2o:
        name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_path = "/datasets/airlines_all.csv"

        print("Import airlines_all.csv from HDFS")
        frame = h2o.import_file("hdfs://{0}{1}".format(name_node, airlines_path))
        total_rows = frame.nrow
        print("rows: {0}".format(total_rows))

        print("Run k-means++ with k = 7 and max_iterations = 10")
        # Column indices used as predictors; 9 is deliberately appended last.
        columns = (list(range(8)) + list(range(11, 16)) + list(range(18, 21))
                   + list(range(24, 29)) + [9])
        km_args = dict(training_frame=frame,
                       x=frame[columns],
                       k=7,
                       init="Furthest",
                       max_iterations=10,
                       standardize=True)
        fitted = h2o.kmeans(**km_args)
        print(fitted)
    else:
        print("Not running on H2O internal network.  No access to HDFS.")
Exemplo n.º 17
0
def directory_import():
    """Compare a single-file import with an import of its whole directory.

    The directory import must have three times the rows and the same number
    of columns as the single file. The HDFS copy is checked only inside the
    H2O network; the local smalldata copy is always checked.
    """
    rows_msg = "Expected 3 times the rows, but got {0} and {1}"
    cols_msg = "Expected same number of cols, but got {0} and {1}"

    if tests.is_running_internal_to_h2o():
        name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_file = "hdfs://{0}{1}".format(name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        hdfs_dir = "hdfs://{0}{1}".format(name_node, "/datasets/iris/identical_iris_files/")
        print("Importing HDFS file {0} and directory {1}".format(hdfs_file, hdfs_dir))
        hdfs_single = h2o.import_file(hdfs_file)
        hdfs_combined = h2o.import_file(hdfs_dir)

        single_rows, single_cols = hdfs_single.dim
        combined_rows, combined_cols = hdfs_combined.dim

        assert single_rows * 3 == combined_rows, rows_msg.format(single_rows, combined_rows)
        assert single_cols == combined_cols, cols_msg.format(single_cols, combined_cols)

    local_file = tests.locate("smalldata/jira/identical_files/iris1.csv")
    # Everything before the file name is the directory to import.
    local_dir = local_file.split("iris1.csv")[0]
    print("Importing smalldata file {0} and directory {1}".format(local_file, local_dir))
    local_single = h2o.import_file(local_file)
    local_combined = h2o.import_file(local_dir)

    single_rows, single_cols = local_single.dim
    combined_rows, combined_cols = local_combined.dim

    assert single_rows * 3 == combined_rows, rows_msg.format(single_rows, combined_rows)
    assert single_cols == combined_cols, cols_msg.format(single_cols, combined_cols)
Exemplo n.º 18
0
def hdfs_kmeans(ip, port):
    """Cluster the iris and covtype data sets after importing them from HDFS.

    The row counts (150 for iris, 581012 for covtype) are asserted before a
    k-means model is trained on each frame. Outside the H2O network only a
    notice is printed.

    :param ip: unused in this function.
    :param port: unused in this function.
    """
    inside_h2o = tests.is_running_internal_to_h2o()
    if not inside_h2o:
        print("Not running on H2O internal network.  No access to HDFS.")
        return

    name_node = tests.get_h2o_internal_hdfs_name_node()
    row_msg = "Wrong number of rows. Got {0}. Should have got {1}"

    # ---- iris: 3 clusters on the first four columns ----
    print("Import iris_wheader.csv from HDFS")
    iris_url = "hdfs://{0}{1}".format(name_node, "/datasets/runit/iris_wheader.csv")
    iris_frame = h2o.import_file(iris_url)
    n = iris_frame.nrow
    print("rows: {0}".format(n))
    assert n == 150, row_msg.format(n, 150)

    print("Running KMeans on iris")
    iris_args = dict(training_frame=iris_frame, k=3, x=iris_frame[0:4], max_iterations=10)
    iris_clusters = h2o.kmeans(**iris_args)
    print(iris_clusters)

    # ---- covtype: 8 clusters on the first 55 columns ----
    print("Importing covtype.data from HDFS")
    covtype_url = "hdfs://{0}{1}".format(name_node, "/datasets/runit/covtype.data")
    covtype_frame = h2o.import_file(covtype_url)
    n = covtype_frame.nrow
    print("rows: {0}".format(n))
    assert n == 581012, row_msg.format(n, 581012)

    print("Running KMeans on covtype")
    covtype_args = dict(training_frame=covtype_frame, x=covtype_frame[0:55], k=8, max_iterations=10)
    covtype_clusters = h2o.kmeans(**covtype_args)
    print(covtype_clusters)