Example #1
    def test_c6_hdfs_fvec(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if this fails"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # has duplicated col name
            # "hhp2.os.noisy.0_1.data",
            # "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]
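        # NOTE: the reassignment below overrides the list above, repeating one dataset 8 times for this run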
        csvFilenameAll = [
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # we have two cdh's though. I guess we're going to use whatever got set up
        # h2o.nodes[0].use_maprfs = False
        # h2o.nodes[0].use_hdfs = True
        # h2o.nodes[0].hdfs_version = 'cdh4'
        # h2o.nodes[0].hdfs_name_node = '172.16.2.176'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            print "Loading", csvFilename, 'from hdfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema="hdfs",
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False,
                                           benchmarkLogging=benchmarkLogging)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)
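            # the logged line looks like, e.g. (values illustrative):
            #   "4 jvms, 28GB heap, Parse datasets/covtype4x.shuffle.data for 12.34 secs"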

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {'ntrees': 1}
                paramsString = json.dumps(kwargs)
                rfTimeoutSecs = 2000
                RFview = h2o_cmd.runRF(parseResult=parseResult,
                                       timeoutSecs=rfTimeoutSecs,
                                       benchmarkLogging=benchmarkLogging,
                                       **kwargs)
                elapsed = time.time() - start
                # report pct against the RF timeout, not the earlier parse timeout
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                    (elapsed / rfTimeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF",
                    "trial " + str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            if 1 == 0:  # disabled: flip to enable key deletion between trials
                print "Deleting all keys, to make sure our parse times don't include spills"
                h2i.delete_keys_at_all_nodes()

            trial += 1
Example #2
    def test_c5_KMeans_sphere_26GB_fvec(self):
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = "syn_sphere15_gen_26GB.csv"
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
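        # NOTE: this byte count corresponds to the 180GB dataset, so the MB/sec
        # figure below is inflated when parsing the 26GB file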
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                (
                    [-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]
        else:
            expected = [
                (
                    [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]
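        # Each expected entry is (center, row_count, total_within_SS). Per the note
        # above, the row counts match the 180GB dataset; for the 26GB file they are
        # roughly 1/7 of that, which the SCALE_SIZE division further down accounts
        # for (illustrative, assuming SCALE_SIZE is 7):
        #   scaled = [rows / SCALE_SIZE for (_, rows, _) in expected]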

        # benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"]
        # benchmarkLogging = ["cpu", "disk", "network", "iostats"]
        # IOStatus can hang?
        # benchmarkLogging = ["cpu", "disk", "network"]
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="hdfs",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="local",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed
            )
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], timeoutSecs=300)
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summary = h2o_cmd.runSummary(
                key=parseResult["destination_key"], numRows=numRows, numCols=numCols, timeoutSecs=300
            )
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                "k": 15,
                "max_iter": 500,
                # "normalize": 1,
                "normalize": 0,  # temp try
                "initialization": "Furthest",
                "destination_key": "junk.hex",
                # we get NaNs if whole col is NA
                "ignored_cols": "C1",
                # reuse the same seed, to get deterministic results
                "seed": 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs["initialization"] = "PlusPlus"
            elif (trial % 3) == 1:
                kwargs["initialization"] = "Furthest"
            else:
                kwargs["initialization"] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format(
                len(h2o.nodes),
                h2o.nodes[0].java_heap_GB,
                "KMeans",
                "trial " + str(trial),
                csvFilename,
                elapsed,
                paramsString,
            )
            print l
            h2o.cloudPerfH2O.message(l)

            # this does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, "d", **kwargs
            )
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
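            # illustrative reading (not h2o_kmeans internals): with multiplier deltas,
            # a result r matches an expected value e when abs(r - e) <= delta * abs(e),
            # i.e. within 1% here for each of (center, size, withinSS)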
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(
                self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial
            )

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult["model"]
            size = model["size"]
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (max_iter - 1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s"
                    % (trial, iterations, max_iter)
                )

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2)
                )

            # our expected result is sorted by cluster center order, but the sizes are from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s predicted cluster sizes: %s not the same as expected: %s" % (trial, size2, expectedSize)
                )

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
Example #4
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]
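        # Illustrative sketch (not part of the original test): the patterns above are
        # shell-style globs; fnmatch shows what they select. "*[1][0-4][0-9].dat"
        # matches file_100.dat .. file_149.dat -- 50 files, hence 50 * avgMichalSize.
        import fnmatch
        assert fnmatch.fnmatch("file_149.dat", "*[1][0-4][0-9].dat")
        assert not fnmatch.fnmatch("file_150.dat", "*[1][0-4][0-9].dat")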

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            # double import still causing problems?
            # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            # importFullList = importResult['files']
            # importFailList = importResult['fails']
            # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # remove the output too! (378)
                ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                # add one since we are no longer 0 based offset
                ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'family': 'binomial',
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                # convert the response to binomial: 1 if > 15, else 0
                execExpr="A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #5
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                importFullList = importResult['files']
                importFailList = importResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542) # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                    x.remove(i)
                    ignore_x.append(i)
                x.remove(378)

                # add one since we are no longer 0 based offset
                x = ",".join(map(lambda x: "C" + str(x+1), x))
                # NOTE: x is built but unused below; only ignored_cols is passed to GLM
                ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'family': 'binomial',
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # convert to binomial
                execExpr="A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)

                # column 378+1 = C379 (1-based); 1 if > 15, else 0
                execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)

                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #6
    def sub_c3_fvec_long(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140
        bucket = "home-0xdiag-datasets"
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = "manyfiles-nflx-gz"
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes) == 1:
            csvFilenameList = [("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600)]
        else:
            csvFilenameList = [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 1800),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ["cpu", "disk", "network"]
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket,
                path=csvPathname,
                schema="local",
                hex_key=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging,
            )
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "Parse result['destination_key']:", parseResult["destination_key"]
            h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]:
                    x.remove(i)
                    ignore_x.append(i)
                x.remove(378)

                # add one since we are no longer 0 based offset
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))
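                # quick standalone check of the 0-based -> 1-based "C" name mapping
                # (illustrative only, not part of the original test): index 3 -> "C4"
                assert ",".join(map(lambda i: "C" + str(i + 1), [3, 4, 540])) == "C4,C5,C541"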

                GLMkwargs = {
                    "ignored_cols": ignore_x,
                    "response": "C379",
                    "max_iter": 4,
                    "n_folds": 1,
                    "family": "binomial",
                    "alpha": 0.2,
                    "lambda": 1e-5,
                }
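                # ("lambda" is passed via the kwargs dict because it is a Python keyword
                # and cannot be written as a named argument to runGLM)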

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult["destination_key"]
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                # use the 1-based numeric column index, matching the other examples
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (379, 379, 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                aHack = {"destination_key": "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #7
    def sub_c2_rel_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140 
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes) == 1:
            csvFilenameList = [
                ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600),
            ]
        else:
            csvFilenameList = [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542) # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]:
                    x.remove(i)
                    ignore_x.append(i)

                # increment by one, because we are no longer zero offset!
                x = ",".join(map(lambda x: "C" + str(x+1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                # this older API thresholds the response itself via case/case_mode (> 15),
                # rather than pre-converting to binomial with exec as in the other examples
                GLMkwargs = {
                    'family': 'binomial',
                    'x': x,
                    'y': 'C379',
                    'case': 15,
                    'case_mode': '>',
                    'max_iter': 4,
                    'n_folds': 1,
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #8
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = "home-0xdiag-datasets"
        importFolderPath = "manyfiles-nflx"
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ["cpu", "disk", "network"]
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
                importFullList = importResult["files"]
                importFailList = importResult["fails"]
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket,
                path=csvPathname,
                schema="local",
                hex_key="A.hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging,
            )
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "Parse result['destination_key']:", parseResult["destination_key"]
            h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # remove the output too! (378)
                ignore_x = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    "ignored_cols": ignore_x,
                    "response": "C379",
                    "max_iter": 10,
                    "n_folds": 1,
                    "family": "binomial",
                    "alpha": 0.2,
                    "lambda": 1e-5,
                }

                # convert to binomial
                # (no A.hex copy needed here: hex_key="A.hex" above, so the parse result is already A.hex)
                # execExpr="A.hex=%s" % parseResult['destination_key']
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = "A.hex[,378+1]=(A.hex[,378+1]>15)"
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                aHack = {"destination_key": "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
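
A note on the ignore_x construction used above: the literals are 0-based column indices, while the parsed frame's generated column names are 1-based ("C1", "C2", ...), hence the x + 1 inside the join. A minimal standalone sketch of that mapping (to_col_names is an illustrative name, not a harness function):

    # Sketch: map 0-based column indices to the 1-based "C<n>" names
    # the parsed frame uses, mirroring the ignore_x join above.
    def to_col_names(zero_based_indices):
        return ",".join("C" + str(i + 1) for i in zero_based_indices)

    assert to_col_names([3, 4, 540]) == "C4,C5,C541"
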
    def test_KMeans_sphere15_180GB(self):
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = "syn_sphere15_2711545732row_6col_180GB_from_7x.csv"
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        # removed 0's from first col because we set "ignored_cols"
        expected = [
            (
                [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                248846122,
                1308149283316.2988,
            ),
            (
                [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                276924291,
                1800760152555.98,
            ),
            (
                [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                235089554,
                375419158808.3253,
            ),
            (
                [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                166180630,
                525423632323.6474,
            ),
            (
                [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                167234179,
                1845362026223.1094,
            ),
            (
                [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                195420925,
                197941282992.43475,
            ),
            (
                [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                214401768,
                11868360232.658035,
            ),
            (
                [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                258853406,
                598863991074.3276,
            ),
            (
                [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                190979054,
                1505088759456.314,
            ),
            (
                [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                87794427,
                1124697008162.3955,
            ),
            (
                [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                78226988,
                1151439441529.0215,
            ),
            (
                [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                167273589,
                693036940951.0249,
            ),
            (
                [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                148426180,
                35942838893.32379,
            ),
            (
                [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                157533313,
                88431531357.62982,
            ),
            (
                [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                118361306,
                1111537045743.7646,
            ),
        ]

        benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"]
        benchmarkLogging = ["cpu", "disk", "network", "iostats"]
        # IOStatus can hang?
        benchmarkLogging = ["cpu", "disk", "network"]
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="hdfs",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs
                )
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="local",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs
                )

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed
            )
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                "max_iter": 30,
                "k": 15,
                "initialization": "Furthest",
                "cols": None,
                "destination_key": "junk.hex",
                # reuse the same seed, to get deterministic results
                "seed": 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs["initialization"] = "PlusPlus"
            elif (trial % 3) == 1:
                kwargs["initialization"] = "Furthest"
            else:
                kwargs["initialization"] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )

            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format(
                len(h2o.nodes),
                h2o.nodes[0].java_heap_GB,
                "KMeans",
                "trial " + str(trial),
                csvFilename,
                elapsed,
                paramsString,
            )
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, "d", **kwargs
            )
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            h2o_kmeans.compareResultsToExpected(
                self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial
            )
            h2i.delete_keys_at_all_nodes()
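
The allowedDelta tuple above is described as multipliers of the expected tuple values, which suggests a per-field relative-error check. h2o_kmeans.compareResultsToExpected is harness code, so the following is only a hedged sketch of that idea under that assumption, not the actual implementation:

    # Sketch (assumption): accept an actual (center, rows, error) tuple if each
    # field is within a relative tolerance of the corresponding expected field.
    def within_relative_delta(actual, expected, allowed_delta):
        center_a, rows_a, err_a = actual
        center_e, rows_e, err_e = expected
        d_center, d_rows, d_err = allowed_delta

        def rel_ok(a, e, d):
            # fall back to an absolute check when the expected value is 0
            return (abs(a - e) <= d * abs(e)) if e != 0 else (abs(a) <= d)

        return (all(rel_ok(a, e, d_center) for a, e in zip(center_a, center_e))
                and rel_ok(rows_a, rows_e, d_rows)
                and rel_ok(err_a, err_e, d_err))
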
Example #10
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                importFullList = importResult['files']
                importFailList = importResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key="A.hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # output 378 can't be in this
                ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'response': 'C379',
                    'max_iter': 10,
                    'n_folds': 1,
                    'family': 'binomial',
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # convert to binomial
                # execExpr="A.hex=%s" % parseResult['destination_key']
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
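
The parse loops above keep recomputing the same two derived numbers: throughput in MB/sec from totalBytes and the elapsed time, and elapsed time as a percentage of the timeout. The arithmetic, pulled out into a standalone sketch (parse_stats is an illustrative name, not a harness function):

    # Sketch: the benchmark arithmetic repeated in the parse loops above.
    def parse_stats(total_bytes, elapsed_secs, timeout_secs):
        mb_per_sec = (total_bytes / 1e6) / elapsed_secs
        pct_of_timeout = int((elapsed_secs * 100) / timeout_secs)
        return mb_per_sec, pct_of_timeout

    mbs, pct = parse_stats(237270000, 30.0, 1800)  # ~7.91 MB/sec, 1 pct
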
Example #11
    def test_c6_hdfs(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # has duplicated col name
            # "hhp2.os.noisy.0_1.data",
            # "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]
        csvFilenameAll = [
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
            "covtype4x.shuffle.data",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # update: look in the runner*.sh scripts; find_cloud.py now has args to get this right
        # we have two cdh's though. I guess we're going to use whatever got setup
        # h2o.nodes[0].use_maprfs = False
        # h2o.nodes[0].use_hdfs = True
        # h2o.nodes[0].hdfs_version = 'hdp2.1'
        # h2o.nodes[0].hdfs_name_node = '172.16.2.186'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            print "Loading", csvFilename, 'from hdfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", timeoutSecs=timeoutSecs,
                doSummary=True, benchmarkLogging=benchmarkLogging)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {'ntree': 1}
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                    benchmarkLogging=benchmarkLogging, **kwargs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial "+str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            if False:  # deliberately disabled: key deletion between trials
                print "Deleting all keys, to make sure our parse times don't include spills"
                h2i.delete_keys_at_all_nodes()

            trial += 1
Example #12
    def test_c6_maprfs_fvec(self):
        h2o.beta_features = True
        print "\nLoad a list of files from maprfs, parse and do 1 RF tree"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # duplicate column header "A"
            # "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # h2o.nodes[0].use_maprfs = True
        # h2o.nodes[0].use_hdfs = False
        # h2o.nodes[0].hdfs_version = 'mapr3.0.1',
        # h2o.nodes[0].hdfs_name_node = '172.16.2.171:7222'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            (importResult,
             importPattern) = h2i.import_only(path=csvPathname,
                                              schema="maprfs",
                                              timeoutSecs=timeoutSecs)
            print "importResult:", h2o.dump_json(importResult)
            succeeded = importResult['files']
            fails = importResult['fails']

            if len(succeeded) < 1:
                raise Exception("Should have imported at least 1 key for %s" %
                                csvPathname)

            # just do a search
            foundIt = None
            for f in succeeded:
                if csvPathname in f:
                    foundIt = f
                    break

            if not foundIt:
                raise Exception(
                    "Should have found %s in the imported keys for %s" %
                    (importPattern, csvPathname))

            totalBytes = 0

            print "Loading", csvFilename, 'from maprfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema="maprfs",
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=360,
                                           doSummary=True,
                                           benchmarkLogging=benchmarkLogging,
                                           noPoll=h2o.beta_features)
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {'ntrees': 1}
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult,
                                       timeoutSecs=2000,
                                       benchmarkLogging=benchmarkLogging,
                                       noPoll=h2o.beta_features,
                                       **kwargs)
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                    (elapsed / timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF",
                    "trial " + str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            print "Deleting all keys, to make sure our parse times don't include spills"
            h2i.delete_keys_at_all_nodes()
            trial += 1
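
When noPoll=h2o.beta_features is passed above, import_parse and runRF return before the job finishes, and h2j.pollWaitJobs then blocks until the cluster's job queue drains. A generic poll-until-done loop of that shape might look like this sketch (poll_fn and is_done are hypothetical stand-ins, not harness APIs):

    import time

    # Sketch: a generic poll-until-done loop in the spirit of pollWaitJobs.
    # poll_fn fetches a status object; is_done decides whether it is finished.
    def wait_for_job(poll_fn, is_done, timeout_secs, retry_delay_secs=2):
        deadline = time.time() + timeout_secs
        while time.time() < deadline:
            status = poll_fn()
            if is_done(status):
                return status
            time.sleep(retry_delay_secs)
        raise Exception("job did not finish within %s secs" % timeout_secs)
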
Example #13
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                importFullList = importResult['files']
                importFailList = importResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key="A.hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)
            h2o_cmd.checkKeyDistribution()

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            execExpr = 'B.hex=A.hex'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            h2o_cmd.checkKeyDistribution()

            execExpr = 'C.hex=B.hex'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            h2o_cmd.checkKeyDistribution()

            execExpr = 'D.hex=C.hex'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            h2o_cmd.checkKeyDistribution()
Example #14
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            # double import still causing problems?
            # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            # importFullList = importResult['files']
            # importFailList = importResult['fails']
            # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename +
                                     " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs,
                                           benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541
                ]:
                    x.remove(i)
                    ignore_x.append(i)

                # plus 1 because we are no longer 0 offset
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'family': 'binomial',
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
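
The binomial-conversion exec expression above rewrites column 379 in place as a 0/1 indicator (value > 15) so the GLM sees a two-class response. The same transformation in plain Python, on a hypothetical in-memory table (binarize_column is an illustrative name):

    # Sketch: threshold a column into a 0/1 indicator, mirroring
    # A.hex[,379]=(A.hex[,379]>15). 'rows' is a hypothetical list-of-rows table.
    def binarize_column(rows, col_index, threshold):
        for row in rows:
            row[col_index] = 1 if row[col_index] > threshold else 0
        return rows

    rows = [[10, 3], [20, 99]]
    assert binarize_column(rows, 0, 15) == [[0, 3], [1, 99]]
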
Example #15
    def test_c6_maprfs_fvec(self):
        h2o.beta_features = True
        print "\nLoad a list of files from maprfs, parse and do 1 RF tree"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle",
            "TEST-poker1000.csv",
            "and-testing.data",
            "arcene2_train.both",
            "arcene_train.both",
            "bestbuy_test.csv",
            "bestbuy_train.csv",
            # "billion_rows.csv.gz",
            "covtype.13x.data",
            "covtype.13x.shuffle.data",
            # "covtype.169x.data",
            "covtype.4x.shuffle.data",
            "covtype.data",
            "covtype4x.shuffle.data",
            "hhp.unbalanced.012.1x11.data.gz",
            "hhp.unbalanced.012.data.gz",
            "hhp.unbalanced.data.gz",
            # duplicate column header "A"
            # "hhp2.os.noisy.0_1.data",
            "hhp2.os.noisy.9_4.data",
            "hhp_9_14_12.data",
            "leads.csv",
            "prostate_long_1G.csv",
        ]

        # find_cloud.py won't set these correctly. Let's just set them here
        # h2o.nodes[0].use_maprfs = True
        # h2o.nodes[0].use_hdfs = False
        # h2o.nodes[0].hdfs_version = 'mapr3.0.1',
        # h2o.nodes[0].hdfs_name_node = '192.168.1.171:7222'

        h2o.setup_benchmark_log()

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        # pick 8 randomly!
        if DO_RANDOM_SAMPLE:
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # save the first, for all comparisons, to avoid slow drift with each iteration
        importFolderPath = "datasets"
        trial = 0
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            csvPathname = importFolderPath + "/" + csvFilename

            timeoutSecs = 1000
            # do an import first, because we want to get the size of the file
            (importResult, importPattern) = h2i.import_only(path=csvPathname, schema="maprfs", timeoutSecs=timeoutSecs)
            print "importResult:", h2o.dump_json(importResult)
            succeeded = importResult['files']
            fails = importResult['fails']

            if len(succeeded) < 1:
                raise Exception("Should have imported at least 1 key for %s" % csvPathname)

            # just do a search
            foundIt = None
            for f in succeeded:
                if csvPathname in f:
                    foundIt = f
                    break

            if not foundIt:
                raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname))

            totalBytes = 0

            print "Loading", csvFilename, 'from maprfs'
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="maprfs", timeoutSecs=timeoutSecs, pollTimeoutSecs=360,
                doSummary=True, benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features)
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "parse result:", parseResult['destination_key']

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            if DO_RF:
                print "\n" + csvFilename
                start = time.time()
                kwargs = {'ntrees': 1}
                paramsString = json.dumps(kwargs)
                RFview = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=2000,
                    benchmarkLogging=benchmarkLogging, noPoll=h2o.beta_features, **kwargs)
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

                l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, "RF", "trial "+str(trial), csvFilename, elapsed, paramsString)
                print l
                h2o.cloudPerfH2O.message(l)

            print "Deleting all keys, to make sure our parse times don't include spills"
            h2i.delete_keys_at_all_nodes()
            trial += 1
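
The import check above scans the succeeded list for the first entry containing the pathname. The same search can be written as a one-liner with next() over a generator expression; a sketch (find_imported_key is an illustrative name):

    # Sketch: first imported key containing the pathname, else None.
    def find_imported_key(succeeded, csv_pathname):
        return next((f for f in succeeded if csv_pathname in f), None)

    assert find_imported_key(['a/b.csv', 'datasets/x.csv'], 'datasets/x.csv') \
        == 'datasets/x.csv'
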
Example #16
    def test_c5_KMeans_sphere_67MB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
        totalBytes = 67306997
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        # benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # clear out all NAs (walk across cols); set them to 0
            # temporarily disabled:
            ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
            ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'max_iter': 10,
                'normalize': 1,
                'initialization': 'Furthest',
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
                }

            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
            h2i.delete_keys_at_all_nodes()
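
Each trial above rotates through the three KMeans initialization modes via trial % 3, so six trials exercise PlusPlus, Furthest, and the algorithm default twice each. The selection logic, isolated (init_for_trial is an illustrative name):

    # Sketch: the trial -> initialization rotation used above.
    INITS = ['PlusPlus', 'Furthest', None]  # None = let the algorithm default

    def init_for_trial(trial):
        return INITS[trial % 3]

    assert [init_for_trial(t) for t in range(6)] == \
        ['PlusPlus', 'Furthest', None, 'PlusPlus', 'Furthest', None]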