예제 #1
0
    def test_KMeans_covtype_fvec(self):
        csvFilenameList = [
            ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(2):
                kwargs = {
                    'k': 6,
                    'initialization': 'Furthest',
                    # 'initialization': '',
                    # 'ignored_cols': range(11, inspect['numCols']),
                    # ignore the response
                    'ignored_cols_by_name': 'C55',
                    'max_iter': 100,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results
                    'seed': 265211114317615310
                }

                start = time.time()
                kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

                expected = [
                    ([2781.64184460309, 162.69950733599902, 16.545275983574268, 243.73547234768156, 50.48239522121315, 942.4480922085701, 208.3915356763203, 218.7135425941215, 140.10956243018794, 1040.6795741397266, 0.22024185323685105, 0.0845245225799837, 0.4957505706376572, 0.19948305354550802, 0.01635558145683929, 0.033196811983660604, 0.026025394050259283, 0.04566180477986607, 0.008617572941792261, 0.03547936261257615, 0.0, 0.0, 0.006189327591882107, 0.13606268110663236, 0.037222303163733886, 0.024007252359445064, 0.040891651692487006, 0.003232264365769295, 1.6188302332734367e-05, 0.004667627172605076, 0.00910861811255187, 9.173371321882807e-05, 0.0025415634662392956, 0.008946735089224526, 0.0023095311328034363, 0.04957397784361021, 0.09252154393235448, 0.03887890610245037, 0.0, 0.0, 0.0010792201555156243, 0.004867282901375466, 0.08281935473426902, 0.045640220376755754, 0.04933654940939677, 0.08426550974265995, 0.07772003949945769, 0.001327440791284218, 0.0014191745045030462, 0.0, 0.0, 0.009513325670870229, 0.010970272880816322, 0.009443176360761713], 185319, 116283720155.37769) ,

                    ([2892.8730376693256, 119.94759695676377, 11.22516236778623, 189.0301354611245, 24.621525329374652, 2631.9842642419744, 219.94967526442753, 223.3794395991835, 135.71226572647987, 5409.1797365002785, 0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325, 0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0, 0.026498422712933754, 0.0, 0.04152904063833735, 0.005158656522545927, 0.0695490814622379, 0.0, 0.0634997216552236, 0.05418444980515866, 0.010391538318797551, 0.0002969010948227871, 0.0, 0.0, 0.0, 0.3677862312117276, 0.07596956763778066, 0.0, 0.01109667841900167, 0.005641120801632956, 0.0, 0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586, 0.018444980515865652, 0.010354425681944703], 26945, 46932273891.61873) ,

                    ([3022.020861415003, 137.8546989122598, 13.3449108178427, 282.99227296949937, 45.23691263596753, 1606.0215197015768, 216.64941537882825, 222.64791856054669, 137.40339644525253, 2529.4366555907336, 0.4113429046111407, 0.08617284724616782, 0.5024842481426914, 0.0, 0.0, 0.0052506191028494405, 0.0, 0.014176671577693489, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018949249239835743, 0.029850161436945546, 0.05403435628977148, 0.020892761982382997, 0.0, 0.0, 0.0018494718033917432, 0.011731607159650168, 0.005979436381304661, 0.0047098837027052445, 0.013714303626845553, 0.0007601642581737249, 0.047788470580859534, 0.10631328171530674, 0.04641704021817498, 0.0036519231372057308, 0.011872668568383437, 0.0, 0.00034481677690354536, 0.17267483777937995, 0.044473527475627724, 0.05637754302372967, 0.1292435973793925, 0.11970627880003762, 0.0013871038525438075, 0.004858781856368139, 0.0, 0.0, 0.03151155136202627, 0.028988119494686687, 0.012491771417823892], 127604, 95229063588.02844) ,

                    ([3051.365089986695, 168.1268450579292, 14.114846831985933, 287.6101588092033, 50.702549817536706, 2835.266162979793, 209.89460702308608, 226.92302305495684, 148.84282479633362, 1461.8985753079312, 0.3284728328107128, 0.0006069141527711857, 0.670920253036516, 0.0, 0.0, 0.0054700083256172235, 0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03886584862938554, 0.013250959002170886, 0.04277966681969203, 0.05480901656564399, 0.0, 0.0, 0.0010426473906581905, 0.0018440853103432178, 0.0, 0.0035014278044491476, 0.011671426014830491, 0.002435437561761296, 0.044405885511091744, 0.10662236712081483, 0.042756323967662366, 0.0, 0.007384122192049426, 0.006263665294625696, 0.0, 0.14390868276285998, 0.022152366576148275, 0.07071327974851968, 0.14799368186805065, 0.1011367968938445, 0.009111493242244337, 0.006427065258833325, 0.0009259331305098857, 0.002318723301612991, 0.03055579330682623, 0.041044514818820564, 0.024074261393257027], 128519, 106432862495.53804) ,

                    ([3052.088693852026, 149.15056174929376, 11.549996765359152, 328.4748452763461, 44.2420589567205, 4786.68757682272, 215.8348392383499, 226.91413106764713, 143.9780260065124, 4192.589071226791, 0.8949819938326181, 0.0, 0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0022642485929312314, 0.002415198499126647, 0.0, 0.00012938563388178466, 0.0, 0.1351648588618377, 0.0, 0.0, 0.0, 0.014836219351777974, 0.0, 0.0, 0.010674314795247235, 0.03553792077286352, 0.0, 0.039290104155435275, 0.09289888512712138, 0.03864317598602636, 0.0, 0.0, 0.0, 0.0, 0.4371509283419232, 0.08636491061609126, 0.0003665926293317232, 0.002717098311517478, 0.017100467944709204, 0.0, 0.0028249196730856323, 0.0, 0.0, 0.03226015138119164, 0.017316110667845514, 0.03204450865805533], 46373, 77991941653.19676) ,

                    ([3119.4885286481917, 165.13178470083923, 11.672206122079334, 271.2690333876713, 39.407851838435064, 4959.81440560285, 212.5861709835175, 227.95909557447322, 148.6725381875264, 1613.4457676749382, 0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323, 0.0, 0.0, 0.0, 0.008346917828895732, 0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865, 0.0, 0.0, 0.02815009358208054, 0.012512829801364487, 0.0, 0.13355068526233171, 0.11424560767976816, 0.008799734347642335, 0.0, 0.0018867354947775161, 0.0012226046006158305, 0.0, 0.44056028497252914, 0.10774014369377528, 0.0033810300066413087, 0.014580691903640641, 0.02313892410795146, 0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0, 0.06503954597597053, 0.022625732053371973, 0.008256354525146411], 66252, 74666940350.2879) ,
                ]


                ### print h2o.dump_json(kmeans)
                predictKey = 'd'
                (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname, parseResult, predictKey, **kwargs)
                # all are multipliers of expected tuple value
                allowedDelta = (0.01, 0.01, 0.01)
                # these clusters were sorted compared to the cluster order in training
                h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
                # why is the expected # of rows not right in KMeans2. That means predictions are wrong
                h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=False,
                    allowRowError=True, trial=trial)

                print "Trial #", trial, "completed\n"
    def test_c5_KMeans_sphere_26GB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = 'syn_sphere15_gen_26GB.csv'
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                ([
                    -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                    31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                    -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    19.00092954923767, -10.999565572612255, 90.00028669073289,
                    1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                    30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                    16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                    -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                    11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                    -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    147.00394564757505, 122.98729664236723, 311.0047920137008,
                    2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]
        else:
            expected = [
                ([
                    0.0, -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    0.0, 5.0, 2.0, 340.0, 1817.995920197288,
                    33970406.992053084, 31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    0.0, 10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    0.0, 12.0, 3.0, 168.0, -4066.995950679284,
                    41077063.00269915, -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    0.0, 19.00092954923767, -10.999565572612255,
                    90.00028669073289, 1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    0.0, 20.0, 0.0, 141.0, -3263.0030236302937,
                    6163210.990273981, 30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    0.0, 21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    0.0, 39.0, 3.0, 470.0, -3337.9880599007597,
                    28768057.98852736, 16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    0.0, 40.0, 1.0, 145.0, 950.9990795199593,
                    14602680.991458317, -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    0.0, 42.0, 4.0, 479.0, -3678.0033024834297,
                    8209673.001421165, 11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    0.0, 48.0, 4.0, 71.0, -951.0035145455234,
                    49882273.00063991, -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    0.0, 147.00394564757505, 122.98729664236723,
                    311.0047920137008, 2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='hdfs',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='local',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'],
                                         timeoutSecs=300)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
                                         numRows=numRows,
                                         numCols=numCols,
                                         timeoutSecs=300)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'max_iter': 500,
                # 'normalize': 1,
                'normalize': 0,  # temp try
                'initialization': 'Furthest',
                'destination_key': 'junk.hex',
                # we get NaNs if whole col is NA
                'ignored_cols': 'C1',
                'normalize': 0,
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             benchmarkLogging=benchmarkLogging,
                                             **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            # his does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self,
                                               tupleResultList,
                                               expected,
                                               trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=False,
                                                allowRowError=True,
                                                trial=trial)

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult['model']
            size = model['size']
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (
                    max_iter -
                    1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s",
                    (trial, iterations, max_iter))

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2))

            # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as expected: %s"
                    % (trial, size, expectedSize))

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
    def test_c5_KMeans_sphere_26GB_fvec(self):
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = "syn_sphere15_gen_26GB.csv"
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                (
                    [-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]
        else:
            expected = [
                (
                    [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]

        benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"]
        benchmarkLogging = ["cpu", "disk", "network", "iostats"]
        # IOStatus can hang?
        benchmarkLogging = ["cpu", "disk", "network"]
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="hdfs",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="local",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed
            )
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], timeoutSecs=300)
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summary = h2o_cmd.runSummary(
                key=parseResult["destination_key"], numRows=numRows, numCols=numCols, timeoutSecs=300
            )
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                "k": 15,
                "max_iter": 500,
                # 'normalize': 1,
                "normalize": 0,  # temp try
                "initialization": "Furthest",
                "destination_key": "junk.hex",
                # we get NaNs if whole col is NA
                "ignored_cols": "C1",
                "normalize": 0,
                # reuse the same seed, to get deterministic results
                "seed": 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs["initialization"] = "PlusPlus"
            elif (trial % 3) == 1:
                kwargs["initialization"] = "Furthest"
            else:
                kwargs["initialization"] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format(
                len(h2o.nodes),
                h2o.nodes[0].java_heap_GB,
                "KMeans",
                "trial " + str(trial),
                csvFilename,
                elapsed,
                paramsString,
            )
            print l
            h2o.cloudPerfH2O.message(l)

            # this does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, "d", **kwargs
            )
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(
                self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial
            )

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult["model"]
            size = model["size"]
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (max_iter - 1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s",
                    (trial, iterations, max_iter),
                )

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2)
                )

            # our expected result is sorted by cluster center order, but the sizes are from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as expected: %s" % (trial, size, expectedSize)
                )

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
예제 #4
0
    def test_KMeans_covtype_fvec(self):
        csvFilenameList = [
            ('covtype.data', 800),
        ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            for trial in range(2):
                kwargs = {
                    'k': 6,
                    'initialization': 'Furthest',
                    # 'initialization': '',
                    # 'ignored_cols': range(11, inspect['numCols']),
                    # ignore the response
                    'ignored_cols_by_name': 'C55',
                    'max_iter': 100,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results
                    'seed': 265211114317615310
                }

                start = time.time()
                kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs)

                expected = [
                    ([
                        2781.64184460309, 162.69950733599902,
                        16.545275983574268, 243.73547234768156,
                        50.48239522121315, 942.4480922085701,
                        208.3915356763203, 218.7135425941215,
                        140.10956243018794, 1040.6795741397266,
                        0.22024185323685105, 0.0845245225799837,
                        0.4957505706376572, 0.19948305354550802,
                        0.01635558145683929, 0.033196811983660604,
                        0.026025394050259283, 0.04566180477986607,
                        0.008617572941792261, 0.03547936261257615, 0.0, 0.0,
                        0.006189327591882107, 0.13606268110663236,
                        0.037222303163733886, 0.024007252359445064,
                        0.040891651692487006, 0.003232264365769295,
                        1.6188302332734367e-05, 0.004667627172605076,
                        0.00910861811255187, 9.173371321882807e-05,
                        0.0025415634662392956, 0.008946735089224526,
                        0.0023095311328034363, 0.04957397784361021,
                        0.09252154393235448, 0.03887890610245037, 0.0, 0.0,
                        0.0010792201555156243, 0.004867282901375466,
                        0.08281935473426902, 0.045640220376755754,
                        0.04933654940939677, 0.08426550974265995,
                        0.07772003949945769, 0.001327440791284218,
                        0.0014191745045030462, 0.0, 0.0, 0.009513325670870229,
                        0.010970272880816322, 0.009443176360761713
                    ], 185319, 116283720155.37769),
                    ([
                        2892.8730376693256, 119.94759695676377,
                        11.22516236778623, 189.0301354611245,
                        24.621525329374652, 2631.9842642419744,
                        219.94967526442753, 223.3794395991835,
                        135.71226572647987, 5409.1797365002785,
                        0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325,
                        0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0,
                        0.026498422712933754, 0.0, 0.04152904063833735,
                        0.005158656522545927, 0.0695490814622379, 0.0,
                        0.0634997216552236, 0.05418444980515866,
                        0.010391538318797551, 0.0002969010948227871, 0.0, 0.0,
                        0.0, 0.3677862312117276, 0.07596956763778066, 0.0,
                        0.01109667841900167, 0.005641120801632956, 0.0,
                        0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586,
                        0.018444980515865652, 0.010354425681944703
                    ], 26945, 46932273891.61873),
                    ([
                        3022.020861415003, 137.8546989122598, 13.3449108178427,
                        282.99227296949937, 45.23691263596753,
                        1606.0215197015768, 216.64941537882825,
                        222.64791856054669, 137.40339644525253,
                        2529.4366555907336, 0.4113429046111407,
                        0.08617284724616782, 0.5024842481426914, 0.0, 0.0,
                        0.0052506191028494405, 0.0, 0.014176671577693489, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.018949249239835743,
                        0.029850161436945546, 0.05403435628977148,
                        0.020892761982382997, 0.0, 0.0, 0.0018494718033917432,
                        0.011731607159650168, 0.005979436381304661,
                        0.0047098837027052445, 0.013714303626845553,
                        0.0007601642581737249, 0.047788470580859534,
                        0.10631328171530674, 0.04641704021817498,
                        0.0036519231372057308, 0.011872668568383437, 0.0,
                        0.00034481677690354536, 0.17267483777937995,
                        0.044473527475627724, 0.05637754302372967,
                        0.1292435973793925, 0.11970627880003762,
                        0.0013871038525438075, 0.004858781856368139, 0.0, 0.0,
                        0.03151155136202627, 0.028988119494686687,
                        0.012491771417823892
                    ], 127604, 95229063588.02844),
                    ([
                        3051.365089986695, 168.1268450579292,
                        14.114846831985933, 287.6101588092033,
                        50.702549817536706, 2835.266162979793,
                        209.89460702308608, 226.92302305495684,
                        148.84282479633362, 1461.8985753079312,
                        0.3284728328107128, 0.0006069141527711857,
                        0.670920253036516, 0.0, 0.0, 0.0054700083256172235,
                        0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.03886584862938554, 0.013250959002170886,
                        0.04277966681969203, 0.05480901656564399, 0.0, 0.0,
                        0.0010426473906581905, 0.0018440853103432178, 0.0,
                        0.0035014278044491476, 0.011671426014830491,
                        0.002435437561761296, 0.044405885511091744,
                        0.10662236712081483, 0.042756323967662366, 0.0,
                        0.007384122192049426, 0.006263665294625696, 0.0,
                        0.14390868276285998, 0.022152366576148275,
                        0.07071327974851968, 0.14799368186805065,
                        0.1011367968938445, 0.009111493242244337,
                        0.006427065258833325, 0.0009259331305098857,
                        0.002318723301612991, 0.03055579330682623,
                        0.041044514818820564, 0.024074261393257027
                    ], 128519, 106432862495.53804),
                    ([
                        3052.088693852026, 149.15056174929376,
                        11.549996765359152, 328.4748452763461,
                        44.2420589567205, 4786.68757682272, 215.8348392383499,
                        226.91413106764713, 143.9780260065124,
                        4192.589071226791, 0.8949819938326181, 0.0,
                        0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                        0.0022642485929312314, 0.002415198499126647, 0.0,
                        0.00012938563388178466, 0.0, 0.1351648588618377, 0.0,
                        0.0, 0.0, 0.014836219351777974, 0.0, 0.0,
                        0.010674314795247235, 0.03553792077286352, 0.0,
                        0.039290104155435275, 0.09289888512712138,
                        0.03864317598602636, 0.0, 0.0, 0.0, 0.0,
                        0.4371509283419232, 0.08636491061609126,
                        0.0003665926293317232, 0.002717098311517478,
                        0.017100467944709204, 0.0, 0.0028249196730856323, 0.0,
                        0.0, 0.03226015138119164, 0.017316110667845514,
                        0.03204450865805533
                    ], 46373, 77991941653.19676),
                    ([
                        3119.4885286481917, 165.13178470083923,
                        11.672206122079334, 271.2690333876713,
                        39.407851838435064, 4959.81440560285,
                        212.5861709835175, 227.95909557447322,
                        148.6725381875264, 1613.4457676749382,
                        0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0,
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323,
                        0.0, 0.0, 0.0, 0.008346917828895732,
                        0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865,
                        0.0, 0.0, 0.02815009358208054, 0.012512829801364487,
                        0.0, 0.13355068526233171, 0.11424560767976816,
                        0.008799734347642335, 0.0, 0.0018867354947775161,
                        0.0012226046006158305, 0.0, 0.44056028497252914,
                        0.10774014369377528, 0.0033810300066413087,
                        0.014580691903640641, 0.02313892410795146,
                        0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0,
                        0.06503954597597053, 0.022625732053371973,
                        0.008256354525146411
                    ], 66252, 74666940350.2879),
                ]

                ### print h2o.dump_json(kmeans)
                predictKey = 'd'
                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeansResult, csvPathname, parseResult, predictKey,
                    **kwargs)
                # all are multipliers of expected tuple value
                allowedDelta = (0.01, 0.01, 0.01)
                # these clusters were sorted compared to the cluster order in training
                h2o_kmeans.showClusterDistribution(self,
                                                   tupleResultList,
                                                   expected,
                                                   trial=trial)
                # why is the expected # of rows not right in KMeans2. That means predictions are wrong
                h2o_kmeans.compareResultsToExpected(self,
                                                    tupleResultList,
                                                    expected,
                                                    allowedDelta,
                                                    allowError=False,
                                                    allowRowError=True,
                                                    trial=trial)

                print "Trial #", trial, "completed\n"