def test_import_covtype_parse_loop(self):
        csvFilename = "covtype.data"
        importFolderPath = "/home/0xdiag/datasets/standard"
        trialMax = 2
        localhost = h2o.decide_if_localhost()
        for tryHeap in [4, 3, 2, 1]:
            print "\n", tryHeap, "GB heap, 1 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap)
            else:
                h2o_hosts.build_cloud_with_hosts(
                    node_count=1, java_heap_GB=tryHeap)

            for trial in range(trialMax):
                # import each time, because h2o deletes source file after parse
                h2i.setupImportFolder(None, importFolderPath)
                key2 = csvFilename + "_" + str(trial) + ".hex"
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=20)
            # sticky ports?
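            # ("sticky ports": sockets left in TIME_WAIT can block an
            # immediate cloud rebuild on the same ports; a later variant of
            # this test waits 60 secs for them to clear)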
            h2o.tear_down_cloud()
            time.sleep(2)
    def test_rf_allyears2k_oobe(self):
        importFolderPath = '/home/0xdiag/datasets'
        csvFilename = 'allyears2k.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
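
        # paramDict isn't defined in this snippet; a minimal, hypothetical
        # placeholder so the loop below reads standalone (only 'ntree' is
        # read from it here, and 10 is just an illustrative value):
        paramDict = {'ntree': 10}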

        for trial in range(10):
            kwargs = paramDict
            timeoutSecs = 30 + kwargs['ntree'] * 2

            start = time.time()
            # randomize the node
            node = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix']['classification_error']
            rows_skipped = rfView['confusion_matrix']['rows_skipped']
            mtry = rfView['mtry']
            mtry_nodes = rfView['mtry_nodes']
            print "mtry:", mtry
            print "mtry_nodes:", mtry_nodes
            self.assertEqual(classification_error, 0, "Should have zero oobe error")
            self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped")

            print "Trial #", trial, "completed"
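
# zeroList / exprList / exprErrorCaseList live at module level in the real
# test files and aren't shown in this listing. A hypothetical sketch, assuming
# the <n>/<keyX>/<col1> template placeholders that h2o_exec (h2e) fills in
# before sending each expression:
zeroList = ['Result0 = 0']
exprList = ['Result<n> = <keyX>[<col1>]']
# deliberately malformed, for the error-case pass in test_slice below
exprErrorCaseList = ['Result<n> = <keyX>[<col1>']
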
    def test_slice(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2o_import.setupImportFolder(None, importFolderPath)

        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_import.parseImportFolderFile(None, 
                csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['desination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # try the error case list
            # I suppose we should test the expected error is correct. 
            # Right now just make sure things don't blow up
            h2e.exec_expr_list_rand(lenNodes, exprErrorCaseList, key2, 
                maxCol=53, maxRow=400000, maxTrials=5, 
                timeoutSecs=timeoutSecs, ignoreH2oError=True)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=timeoutSecs)
    def test_B_kmeans_benign(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = "benign.csv"
        key2 = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        # FIX! key2 isn't working with Parse2 ? parseKey['destination_key'] not right?
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\nStarting", csvFilename

        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
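        # e.g. with a 0.01 multiplier, a center coordinate of 24.54 may drift
        # by up to ~0.245 before compareResultsToExpected flags it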

        # loop, to see if we get same centers
        for trial in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_exec_import_hosts_bigfiles(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 4000

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        # Update: need unique key names apparently. can't overwrite prior parse output key?
        # replicating lines means they'll get reparsed. good! (but give new key names)

        csvFilenameList = [
            ("covtype.data", "c"),
            ("covtype20x.data", "c20"),
            ("covtype200x.data", "c200"),
            ("billion_rows.csv.gz", "b"),
            ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, 
                csvFilename, importFolderPath, key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            exec_list(exprList, lenNodes, csvFilename, key2)
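
# exec_list isn't shown in this snippet; a plausible sketch that mirrors the
# h2e calls in the neighboring tests (the argument order matches the call
# above, but the limits here are assumptions, not the project's helper):
def exec_list(exprList, lenNodes, csvFilename, key2):
    h2e.exec_zero_list(zeroList)
    # the expression templates use colX+1, so stay at 53 for 54-column covtype
    h2e.exec_expr_list_rand(lenNodes, exprList, key2,
        maxCol=53, maxRow=400000, maxTrials=100, timeoutSecs=60)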
    def test_C_kmeans_prostate(self):

        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = "prostate.csv"
        key2 = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, header=1, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\nStarting", csvFilename

        # loop, to see if we get same centers
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': 2, 'destination_key': 'prostate_k.hex',
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310}

            # for fvec only?
            kwargs.update({'max_iter': 50, 'max_iter2': 1, 'iterations': 5})

            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
    def test_exec_import_hosts(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2o_import.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameAll = [
            ("covtype.data", "cA", 5),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_import.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['desination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
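
# 'localhost' is a module global in these test files, normally set while the
# cloud is built, as in the first example above:
localhost = h2o.decide_if_localhost()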
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ["covtype.data"]
        else:
            csvFilenameList = [
                "covtype200x.data",
                "covtype200x.data",
                "covtype.data",
                "covtype.data",
                "covtype20x.data",
                "covtype20x.data",
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = "/home/0xdiag/datasets/standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm["GLMModel"]
            coefficients = GLMModel["coefficients"]
            validationsList = GLMModel["validations"]
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write(".")
            sys.stdout.flush()
    def test_parse_covtype20x_loop(self):
        csvFilename = "covtype20x.data"
        importFolderPath = "/home/0xdiag/datasets"
        trialMax = 2
        for tryJvms in [1,2,3,4]:
            for tryHeap in [1,3]:
                print "\n", tryHeap,"GB heap,", tryJvms, "jvm per host, import folder,", \
                    "then loop parsing 'covtype20x.data' to unique keys"
                h2o_hosts.build_cloud_with_hosts(node_count=tryJvms, java_heap_GB=tryHeap)
                timeoutSecs=300
                for trial in range(trialMax):
                    # since we delete the key, we have to re-import every iteration, to get it again
                    h2i.setupImportFolder(None, importFolderPath)

                    key2 = csvFilename + "_" + str(trial) + ".hex"
                    start = time.time()
                    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, 
                        timeoutSecs=timeoutSecs, retryDelaySecs=4, pollTimeoutSecs=60)
                    elapsed = time.time() - start
                    print "Trial #", trial, "completed in", elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # h2o removes key after parse now
                    ## print "Removing", parseKey['source_key']
                    ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key'])
                    ### print "removeKeyResult:", h2o.dump_json(removeKeyResult)

                # sticky ports?
                h2o.tear_down_cloud()
                time.sleep(tryJvms * 5)
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]
        else:
            csvFilenameList = [
                'YearPredictionMSD.txt'
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)

            # different when n_folds (cross-validation) is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)

            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
    def test_B_importFolder_files(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            # quick test first
            "covtype.data", 
            # then the real thing
            "billion_rows.csv.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, pollTimeoutSecs=60)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25

            # RF seems to get memory allocation errors on single machine (16GB dram)
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=5,parseKey=parseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            # now some GLm
            kwargs = {'x': 0, 'y': 1, 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1}
            # one coefficient is checked a little more
            colX = 0

            # L2 
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

            sys.stdout.write('\n.')
            sys.stdout.flush() 
    def test_sum_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            csvFilenameAll = [
                ("covtype.data", "cA", 5,  1),
                ("covtype.data", "cB", 5,  1),
                ("covtype.data", "cC", 5,  1),
            ]
        else:
            csvFilenameAll = [
                ("covtype.data", "cA", 5,  1),
                ("covtype20x.data", "cD", 50, 20),
                ("covtype200x.data", "cE", 50, 200),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\n*************"
            print "colResultList", colResultList
            print "*************"

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
    def test_from_import(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'

        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25
            RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=parseKey,
                timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            # wait in case it recomputes it
            time.sleep(10)

            sys.stdout.write('.')
            sys.stdout.flush() 
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'}
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_rf_covtype_fvec(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(jobDispatch)
            
            # don't poll for fvec 
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
            elapsed = time.time() - start
            print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            print h2o.dump_json(rfResult)
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = key2
            rfView['model_key'] = kwargs['model_key']
            rfView['ntree'] = kwargs['ntree']
            rfViewInitial.append(rfView)

            print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch

            h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)


        # we saved the initial response; if we poll again, the jobs should be
        # done now, and it's better to get the result that way rather than via
        # inspect (to match what simpleCheckGLM expects)
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntree = rfView['ntree']
            # allow it to poll to complete
            rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
    def test_from_import(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'

        timeoutSecs = 500

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "covtype20x.data", 
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for trial in range(3):
            for csvFilename in csvFilenameList:
                h2i.setupImportFolder(None, importFolderPath)
                # creates csvFilename.hex from file in importFolder dir 
                start = time.time()
                parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
                elapsed = time.time() - start
                print csvFilename, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n"
                print csvFilename, 'H2O reports parse time:', parseKey['response']['time']

                # h2o doesn't produce this, but h2o_import.py adds it for us.
                print "Parse result['source_key']:", parseKey['source_key']
                print "Parse result['destination_key']:", parseKey['destination_key']
                print "\n" + csvFilename

                storeView = h2o.nodes[0].store_view()
                ### print "storeView:", h2o.dump_json(storeView)
                # h2o deletes key after parse now
                ## print "Removing", parseKey['source_key'], "so we can re-import it"
                ## removeKeyResult = h2o.nodes[0].remove_key(key=parseKey['source_key'])
                ## print "removeKeyResult:", h2o.dump_json(removeKeyResult)

            print "\nTrial", trial, "completed\n"
    def test_import_multi_syn_datasets(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = '/home/0xdiag/datasets'

        print "This imports a folder of csv files..i.e points to syn_datasets with no regex"
        print "Doesn't put anything in syn_datasets. When run with import folder redirected"
        print "to import S3, there is a syn_datasets with 100 files"
        print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?" 
        timeoutSecs = 500
        if h2o.nodes[0].redirect_import_folder_to_s3_path:
            csvFilenameAll = [
                # FIX! ..just folder doesn't appear to work. add regex
                # need a destination_key...h2o seems to use the regex if I don't provide one
                ### "syn_datasets/*", 
                "syn_datasets/*_10000x200*", 
                ]
        else:
            csvFilenameAll = [
                # FIX! ..just folder doesn't appear to work. add regex
                # need a destination_key...h2o seems to use the regex if I don't provide one
                ### "syn_datasets/*", 
                "syn_datasets/*", 
                ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2="syn_datasets.hex",
                timeoutSecs=500)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "from all files num_rows:", "{:,}".format(inspect['num_rows']), \
                "num_cols:", "{:,}".format(inspect['num_cols'])

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
            start = time.time()
            RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            # so we can see!
            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            time.sleep(5)
    def test_rf_kddcup_1999(self):
        # since we'll be waiting, pop a browser
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        csvFilename = 'kddcup_1999.data.gz'

        print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\
            "compared to running with the parameters specified and matching the browser RF query defaults. " +\
            "Also run the param for full scoring vs OOBE scoring."

        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None,parseKey['destination_key'])

        for trials in range(4):
            print "\n" + csvFilename, "Trial #", trials
            start = time.time()

            kwargs = {
                'response_variable': 'classifier',
                'ntree': 200,
                'gini': 1,
                'class_weights': None,
                'stratify': 0,
                # 'features': None,
                'features': 7,
                'ignore': None,
                'sample': 67,
                'bin_limit': 1024,
                'depth': 2147483647,
                'seed': 784834182943470027,
                'parallel': 1,
                'exclusive_split_limit': None,
                }

            if trials == 0:
                kwargs = {}
            elif trials == 1:
                kwargs['out_of_bag_error_estimate'] = None
            elif trials == 2:
                kwargs['out_of_bag_error_estimate'] = 0
            elif trials == 3:
                kwargs['out_of_bag_error_estimate'] = 1

            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=50,parseKey=parseKey, 
                timeoutSecs=300, retryDelaySecs=1.0, **kwargs)
            print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
    def test_KMeans_winesPCA(self):
        if localhost:
            csvFilenameList = [
                # with winesPCA2.csv specify cols = "1,2"
                ('winesPCA.csv', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('winesPCA.csv', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = os.path.abspath(h2o.find_file('smalldata'))
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                # appears not to take 'cols'?
                'cols': None,
                'epsilon': 1e-6,
                'k': 3
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
	    print "Expected centers: [-2.276318, -0.965151], with 59 rows."
	    print "                  [0.0388763, 1.63886039], with 71 rows."
	    print "		     [2.740469, -1.237816], with 48 rows."
	    model_key = kmeans['destination_key']
	    kmeansScoreResult = h2o.nodes[0].kmeans_score(
	    	key = parseKey['destination_key'], model_key = model_key)
	    score  = kmeansScoreResult['score']
    def test_GLM_covtype20x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype20x.data', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2, noise=('JStack', None))
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                'cols': None,
                'epsilon': 1e-4,
                'k': 2
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
    def test_B_importFolder_files(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 1500

        csvFilenameAll = [
            # quick test first
            "covtype.data", 
            # then the real thing
            "billion_rows.csv.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1}
            # one coefficient is checked a little more
            colX = 0

            # L2 
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

            sys.stdout.write('\n.')
            sys.stdout.flush() 
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {'k': 1 }
                # 'destination_key': csvFilename + "_" + str(trial) + '.hex'}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansGridOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeans_covtype20x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
                ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype20x.data', 480,'cA'),
                # ('covtype200x.data', 1000,'cE'),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, key2=key2) # noise=('JStack', None)
            print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            kwargs = {
                'cols': None,
                'epsilon': 1e-4,
                'k': 2, 
                # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                'seed': 265211114317615310,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
    def test_RF_poker_311M(self):
        # since we'll be waiting, pop a browser
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)

        csvFilename = 'new-poker-hand.full.311M.txt.gz'
        for trials in range(2):
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None,parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=5,depth=5,parseKey=parseKey, 
                timeoutSecs=600, retryDelaySecs=10.0)
            print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'
    def test_exec_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 15),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cB", 15),
                ("covtype20x.data", "cD", 60),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            cnum += 1
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # we use colX+1 so keep it to 53
            # we use factor in this test...so timeout has to be bigger!
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=(timeoutSecs))
    def parseFile(self, s3bucket, localbucket, pathname, timeoutSecs, header, **kwargs):
        if USE_LOCAL: # this can get redirected to s3/s3n by jenkins
            (importFolderPath, csvFilename) = os.path.split("/" + localbucket + pathname)
            h2i.setupImportFolder(None, importFolderPath)
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=180)

        else:
            schema = "s3n://"
            bucket = s3bucket
            URI = schema + bucket + pathname
            importResult = h2o.nodes[0].import_hdfs(URI)
            start      = time.time()
            parseKey = h2o.nodes[0].parse("*" + pathname, timeoutSecs=timeoutSecs, header=header)

        parse_time = time.time() - start 
        h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
        parseKey['python_call_timer'] = parse_time
        return parseKey
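
    # a hedged usage sketch for parseFile above; the bucket and path values
    # are made up for illustration, not taken from the source
    def test_parse_one_file(self):
        parseKey = self.parseFile('h2o-datasets', 'home-0xdiag-datasets',
            '/standard/covtype.data', timeoutSecs=300, header=1)
        print "python_call_timer:", parseKey['python_call_timer']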
    def test_short(self):
            csvFilename = 'part-00000b'
            ### csvFilename = 'short'
            importFolderPath = '/home/hduser/data'
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename

            # FIX! does 'separator=' take ints or a hex string?
            # it looks like it takes the hex string (two chars)
            start = time.time()
            # hardwire TAB (ASCII 9) as the separator, as opposed to generic white space
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, separator=9)
            print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds"

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500)
            print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # num_rows = inspect['num_rows']
            # num_cols = inspect['num_cols']

            keepPattern = "oly_|mt_|b_"
            y = "is_purchase"
            print "y:", y
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x, 
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 100,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    def test_vector_filter_factor(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            maxTrials = 200
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype.data", "cB", 5),
            ]
        else:
            maxTrials = 20
            csvFilenameAll = [
                ("covtype.data", "cA", 5),
                ("covtype20x.data", "cC", 50),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            # have to import each time, because h2o deletes the source file after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 53
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=53, maxRow=400000, maxTrials=maxTrials, timeoutSecs=timeoutSecs)
    def test_import_covtype_parse_loop(self):
        csvFilename = "covtype.data"
        importFolderPath = "/home/0xdiag/datasets"
        trialMax = 2
        for tryHeap in [4,3,2,1]:
            print "\n", tryHeap,"GB heap, 2 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            localhost = h2o.decide_if_localhost()
            if localhost:
                h2o.build_cloud(2, java_heap_GB=tryHeap)
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=2, java_heap_GB=tryHeap)

            h2i.setupImportFolder(None, importFolderPath)
            for trial in range(trialMax):
                key2 = csvFilename + "_" + str(trial) + ".hex"
                parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2, timeoutSecs=20)
            # sticky ports?
            h2o.tear_down_cloud()
            print "Waiting 60 secs for TIME_WAIT sockets to go away"
            time.sleep(60)
    def test_from_import(self):
        importFolderPath = "/home/0xdiag/datasets/standard"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 500
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            # "covtype200x.data",
            # "100million_rows.csv",
            # "200million_rows.csv",
            # "a5m.csv",
            # "a10m.csv",
            # "a100m.csv",
            # "a200m.csv",
            # "a400m.csv",
            # "a600m.csv",
            # "billion_rows.csv.gz",
            # "new-poker-hand.full.311M.txt.gz",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])

            print "\n" + csvFilename
            start = time.time()
            RFview = h2o_cmd.runRFOnly(trees=1, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs)

            h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            time.sleep(10)

            sys.stdout.write(".")
            sys.stdout.flush()
Example #31
    def test_GLM_covtype_train(self):
        print "\nMichal will hate me for another file needed: covtype.shuffled.data"
        importFolderPath = "/home/0xdiag/datasets"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             key2=key2,
                                             header=0,
                                             timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # pct10 truncates, so the last 10% chunk can differ slightly from exactly 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # entry 0 gets a copy of entry 10 (100% of rows), just to see
        rowsForPct[0] = rowsForPct[10]
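        # e.g. num_rows=1000 gives pct10=100 and, after the two overwrites above,
        # rowsForPct == [1000, 100, 200, ..., 900, 1000], i.e. index i is ~i*10% of the rows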

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        # start at 90% rows + 1
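        # slice(key, startRow) presumably returns rows from startRow through the end,
        # and slice(key, startRow, length) returns 'length' rows
        # (assumed semantics of the old H2O exec slice())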

        execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] + 1) + ")"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTest,
                           timeoutSecs=10)

        kwargs = {
            'y': 54,
            'max_iter': 20,
            'n_folds': 0,
            'thresholds': 0.5,
            'alpha': 0.1,
            'lambda': 1e-5,
            'family': 'binomial',
            'case_mode': '=',
            'case': 2
        }
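        # case_mode '=' with case 2 recodes the multiclass covtype response (col 54)
        # into a binomial problem: class 2 vs. everything else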
        timeoutSecs = 60

        for trial in range(10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r" + str(trial)
            execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
            h2o_exec.exec_expr(None,
                               execExpr,
                               resultKey=resultKey,
                               timeoutSecs=10)
            parseKey['destination_key'] = resultKey
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "glm end on ", parseKey[
                'destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=dataKeyTest,
                                           model_key=modelKey,
                                           thresholds="0.5",
                                           timeoutSecs=timeoutSecs)
            print "glmScore end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds'
            ### print h2o.dump_json(glmScore)
            classErr = glmScore['validation']['classErr']
            auc = glmScore['validation']['auc']
            err = glmScore['validation']['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / num_rows), "pct. of all rows"
Example #32
    def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
        # the expected results are only valid for the shuffled version,
        # since 10% samples of this smallish dataset differ between
        # shuffled and non-shuffled orderings
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on", csvFilename
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # entry 0 gets a copy of entry 10 (100% of rows), just to see
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        # start at 90% rows + 1
        execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        # don't use the smaller samples... bad error rates, plus for sorted covtype you can get just one class!
        for trial in range(8,9):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r_" + csvFilename + "_" + str(trial)
            execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
            h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
            # hack so the RF will use the sliced result
            # FIX! don't use the sliced bit..use the whole data for rf training below
            ### parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
            # kwargs['model_key'] = "model"
            # double check the rows/cols
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, "going into RF")
            
            start = time.time()
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)
            


            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)

            kwargs['iterative_cm'] = 1
            kwargs['no_confusion_matrix'] = 0

            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            # double check the rows/cols
            inspect = h2o_cmd.runInspect(key=dataKeyTest)
            h2o_cmd.infoFromInspect(inspect, "dataKeyTest")

            rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        # return the last rfv done during training
        return rfv
Example #33
    def test_GLM_100Mx70_hosts(self):
        # enable this if you need to re-create the file
        if 1 == 0:
            SYNDATASETS_DIR = h2o.make_syn_dir()
            createList = [
                (100000000, 70, 'cA', 10000),
            ]

            for (rowCount, colCount, key2, timeoutSecs) in createList:
                # pick the per-file seed before using it in the filename
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            # Have to copy it to /home/0xdiag/datasets!

        if localhost:
            csvFilenameList = [
                # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'),
                # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'),
                ('rand_logreg_100000000x70.csv.gz', 500, 'rand_100Mx70.hex'),
            ]
        else:
            # None is okay for key2
            csvFilenameList = [
                # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'),
                # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'),
                ('rand_logreg_100000000x70.csv.gz', 500, 'rand_100Mx70.hex'),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000,
                                                 retryDelaySecs=5,
                                                 initialDelaySecs=10,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            y = num_cols - 1
            kwargs = {
                'family': 'binomial',
                'link': 'logit',
                'y': y,
                'max_iter': 8,
                'n_folds': 0,
                'beta_epsilon': 1e-4,
                'alpha': 0,
                'lambda': 0
            }

            for trial in range(3):
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
                elapsed = time.time() - start
                print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.',
                print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #34
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
            ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
            ]

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {
                    'k': k,
                    'epsilon': epsilon,
                    'cols': cols,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,
                                             key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #35
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ['YearPredictionMSD.txt']
        else:
            csvFilenameList = ['YearPredictionMSD.txt']

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=120)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as a kwarg because it's a Python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=2000,
                                     **kwargs)

            # different when n_folds (cross-validation) is used? No trainingErrorDetails?
            h2o.verboseprint("\nglm:", glm)

            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            print "GLM time", GLMModel['time']

            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations,
                                          validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients,
                                          coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush()
Example #36
    def test_poisson_covtype20x(self):
        if localhost:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
                ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            if (1==0):
                print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
                # pythonic!
                x = ",".join(map(str,range(33)))
            else:
                x = ""

            print "WARNING: max_iter set to 8 for benchmark comparisons"
            max_iter = 8

            y = "54"

            kwargs = {
                'x': x,
                'y': y, 
                'family': 'binomial',
                'link': 'logit',
                'n_folds': 0, 
                'case_mode': '=', 
                'case': 1, 
                'max_iter': max_iter, 
                'beta_epsilon': 1e-3}
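            # alpha mixes the penalties (0 = ridge/L2, 1 = lasso/L1, in between = elastic net);
            # lambda sets the overall regularization strength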

            # L2 
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

            # Elastic
            kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (Elastic) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)

            # L1
            kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L1) end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
    def test_rf_covtype_train_oobe3(self):
        print "\nUse randomFilter to sample the dataset randomly. then slice it"
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"

        h2i.setupImportFolder(None, importFolderPath)
        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=100)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # entry 0 gets a copy of entry 10 (100% of rows), just to see
        rowsForPct[0] = rowsForPct[10]

        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        dataKeyTrain = "rTrain"

        # FIX! too many digits (10) in the 2nd param seems to cause stack trace
        execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

        execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
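        # randomFilter(key, nRows, seed) presumably samples nRows rows at random with the
        # given seed (assumed semantics of the old H2O exec randomFilter()); note test and
        # train both use seed 12345 here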

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        for trial in range(1,10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r" + str(trial)
            execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
            # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
            h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
            parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(trial)
            
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            kwargs['iterative_cm'] = 1
            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp
Example #38
    def test_GLM_covtype2000x(self):
        if localhost:
            csvFilenameList = [
                # 68 secs on my laptop?
                ('covtype20x.data', 480, 'cA'),
            ]
        else:
            # None is okay for key2
            csvFilenameList = [
                ('covtype2000x.data', 3600, 'cA'),
                # ('covtype200x.data', 1000,'cE'),
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = '/home2/0xdiag/datasets'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs, key2 in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 key2=key2)
            print "parse end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o.check_sandbox_for_errors()

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            if (1 == 0):
                print "WARNING: just first 33 features. Comparison to allstate"
                # pythonic!
                x = ",".join(map(str, range(33)))
            else:
                x = ""

            y = "54"

            kwargs = {
                'x': x,
                'y': y,
                'family': 'binomial',
                'link': 'logit',
                'n_folds': 0,
                'case_mode': '=',
                'case': 1,
                'max_iter': 8,
                'beta_eps': 1e-3
            }
            print "WARNING: max_iter set to 8 for benchmark comparisons"

            # L2
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm (L2) end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
            h2o.check_sandbox_for_errors()

            # Elastic
            kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm (Elastic) end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
            h2o.check_sandbox_for_errors()

            # L1
            kwargs.update({'alpha': 1.0, 'lambda': 1e-4})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm (L1) end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
            h2o.check_sandbox_for_errors()
Example #39
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000
        avgMichalSize = 116561140
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize = 183
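        # these average file sizes feed the totalBytes entries below; totalBytes is only
        # used to report an approximate MB/sec for each parse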
        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800),
                ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz",
                 100 * avgMichalSize, 1800),
                ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz",
                 200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
            ]

        if 1 == 1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600),
                # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600),
                # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
                #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz",
                 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz",
                 120 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz",
                 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz",
                 160 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                # for now, take too long on 2x100GB heap on 164
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz",
                 50 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
                # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ]
        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/standard'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz",
                 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz",
                 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz",
                 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz",
                 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz",
                 "file_100.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz",
                 "file_200.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz",
                 "file_300.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz",
                 100 * avgMichalSize, 1200),
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
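        # with noPoll=True a parse request returns immediately; the loop below then fires
        # the next two parses too and waits on the jobs queue (pollWaitJobs), so up to
        # three parses overlap and their byte counts are summed for the MB/sec report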
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
        jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
        jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
              ' -Dcom.sun.management.jmxremote.authenticate=false' + \
              ' -Dcom.sun.management.jmxremote.ssl=false'  + \
              ' -Dcom.sun.management.jmxremote' + \
              ' -Dcom.sun.management.jmxremote.local.only=false'
        jea = ' -Dlog.printAll=true'

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(
                    2,
                    java_heap_GB=tryHeap,
                    base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(
                    base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)
                importFullList = importFolderResult['files']
                importFailList = importFolderResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                    importFailList)
                # creates csvFilename.hex from file in importFolder dir

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilepattern,
                    importFolderPath,
                    key2=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        parseKey = h2i.parseImportFolderFile(
                            None,
                            csvFilepattern,
                            importFolderPath,
                            key2=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i + 2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        parseKey = h2i.parseImportFolderFile(
                            None,
                            csvFilepattern,
                            importFolderPath,
                            key2=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB,
                        csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.columnInfoFromInspect(
                        parseKey['destination_key'],
                        exceptionOnMissingValues=False)

                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to slice out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [
                            3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19,
                            20, 424, 425, 426, 540, 541, 378
                    ]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x,
                        'y': 378,
                        'case': 15,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                             timeoutSecs=timeoutSecs,
                                             **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB,
                        csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
Example #40
    def test_RF_mnist_reals(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz",    600),
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is the response (digit label)
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 trainCsvFilename,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseKey['destination_key'],
                timeoutSecs=300,
                forRF=True)
            ntree = 10
            params = {
                'response_variable': 0,
                'ignore': ignore_x,
                'ntree': ntree,
                'iterative_cm': 1,
                'out_of_bag_error_estimate': 1,
                # 'data_key': 'mnist_reals_training.csv.hex',
                'features': 28,  # fixed, since ignoring cols would change the sqrt(cols) default
                'exclusive_split_limit': None,
                'depth': 2147483647,
                'stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample': 67,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'model_key': 'RF_model',
                'bin_limit': 1024,
                'seed': 784834182943470027,
                'parallel': 1,
                'use_non_local_data': 0,
                'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }
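            # Sanity sketch (our addition, assuming the sqrt(cols) default the
            # 'features' comment above alludes to): MNIST has 28x28 = 784
            # pixel columns, and sqrt(784) = 28 matches the pinned value.
            assert params['features'] == int(784 ** 0.5)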

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRFOnly(parseKey=parseKey,
                                       rfView=True,
                                       timeoutSecs=timeoutSecs,
                                       pollTimeoutSecs=60,
                                       retryDelaySecs=2,
                                       **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['model_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response_variable': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2,
                                       model_key=modelKey,
                                       ntree=ntree,
                                       out_of_bag_error_estimate=0,
                                       timeoutSecs=60,
                                       pollTimeoutSecs=60,
                                       noSimpleCheck=False,
                                       **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            self.assertAlmostEqual(
                classification_error,
                0.03,
                delta=0.5,
                msg="Classification error %s differs too much" %
                classification_error)
            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #41
    def test_B_importFolder_files(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 900

        #    "covtype169x.data",
        #    "covtype.13x.shuffle.data",
        #    "3G_poker_shuffle"
        #    "billion_rows.csv.gz",
        csvFilenameAll = [
            # quick test first
            "covtype.data", 
            # then the real thing
            "billion_rows.csv.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)


            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])

            print "\n" + csvFilename
            start = time.time()
            # poker and the water.UDP.set3(UDP.java) fail issue..
            # constrain depth to 25

            # RF seems to get memory allocation errors on single machine (16GB dram)
            ### RFview = h2o_cmd.runRFOnly(trees=1,depth=5,parseKey=parseKey, timeoutSecs=timeoutSecs)
            ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

            # now some GLM
            kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1}
            # one coefficient is checked a little more
            colX = 0

            # L2 
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)

            sys.stdout.write('\n.')
            sys.stdout.flush() 
Example #42
    def test_four_billion_rows(self):
        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/billions"
        h2i.setupImportFolder(None, importFolderPath)
        timeoutSecs = 1500

        csvFilenameAll = [
            "four_billion_rows.csv",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()

            # Parse*********************************
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=timeoutSecs,
                                                 pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            value_size_bytes = inspect['value_size_bytes']
            row_size = inspect['row_size']
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1  # 1 byte per col (num_cols includes the output col)
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))
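            # Worked numbers for the checks above: this file should parse to
            # 2 cols and 4 billion rows (asserted below), so at 1 byte per col,
            # row_size == 2 and value_size_bytes == 2 * 4e9 = 8,000,000,000.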

            summaryResult = h2o_cmd.runSummary(key=parseKey['destination_key'],
                                               timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                num_cols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, num_cols))
            self.assertEqual(4 * 1000000000,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'max_iter': 20,
                'cols': None,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'x': 0,
                'y': 1,
                'n_folds': 0,
                'case_mode': '=',
                'case': 1
            }
            # one coefficient is checked a little more
            colX = 0

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
Example #43
    def test_GLM_mnist(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1, "Should see more than 1 file in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
                key2=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
                key2=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            params = {
                'x': x, 
                'y': y,
                'case_mode': '=',
                'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
                }
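            # Each pass below fits a one-vs-rest binomial: case_mode '=' with
            # case c makes "label == c" the positive class, so ten GLMs cover
            # all ten digits.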

            for c in [0,1,2,3,4,5,6,7,8,9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5",
                    timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Example #44
    def test_parse_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # just do the import folder once
        importFolderPath = "/home/0xdiag/datasets/libsvm"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1, 2, True, True),
            ("mnist_train.svm", "cM", 30, 0, 9, False, False),
            # multi-label target like 1,2,5 ..not sure what that means
            # ("tmc2007_train.svm",  "cJ", 30, 0, 21.0, False, False),
            # illegal non-ascending cols
            # ("syn_6_1000_10.svm",  "cK", 30, -36, 36, True, False),
            # ("syn_0_100_1000.svm", "cL", 30, -36, 36, True, False),
            # fails csvDownload
            ("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
            ("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
            ("news20.svm", "cH", 30, 1, 20, False, False),
            ("connect4.svm", "cB", 30, -1, 1, False, False),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
            ("gisette_scale.svm", "cF", 30, -1, 1, False, False),
            ("mushrooms.svm", "cG", 30, 1, 2, False, False),
        ]
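        # Tuple fields, matching the unpack in the loop below:
        # (csvFilename, key2, timeoutSecs, expectedCol0Min, expectedCol0Max,
        #  enableDownloadReparse, enableSizeChecks)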

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, expectedCol0Min, expectedCol0Max,
             enableDownloadReparse, enableSizeChecks) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvPathname, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspectFirst = h2o_cmd.runInspect(None,
                                              parseKey['destination_key'],
                                              timeoutSecs=360)
            print "Inspect:", parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"
            h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
            # look at the min/max for the target col (0) and compare to expected for the dataset

            imin = inspectFirst['cols'][0]['min']
            imax = inspectFirst['cols'][0]['max']

            if expectedCol0Min:
                self.assertEqual(
                    imin,
                    expectedCol0Min,
                    msg='col %s min %s is not equal to expected min %s' %
                    (0, imin, expectedCol0Min))
            if expectedCol0Max:
                self.assertEqual(
                    imax,
                    expectedCol0Max,
                    msg='col %s max %s is not equal to expected max %s' %
                    (0, imax, expectedCol0Max))

            print "\nmin/max for col0:", imin, imax

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            if DO_SUMMARY:
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0,
                    key=parseKey['destination_key'],
                    timeoutSecs=300,
                    noPrint=True)
                summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            if DO_DOWNLOAD_REPARSE and enableDownloadReparse:
                missingValuesListA = h2o_cmd.infoFromInspect(
                    inspectFirst, csvPathname)
                num_colsA = inspectFirst['num_cols']
                num_rowsA = inspectFirst['num_rows']
                row_sizeA = inspectFirst['row_size']
                value_size_bytesA = inspectFirst['value_size_bytes']

                # do a little testing of saving the key as a csv
                csvDownloadPathname = SYNDATASETS_DIR + "/" + csvFilename + "_csvDownload.csv"
                print "Trying csvDownload of", csvDownloadPathname
                h2o.nodes[0].csv_download(key=key2,
                                          csvPathname=csvDownloadPathname)
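                # Round-trip check: write the parsed key back out as csv, parse
                # that file under a new key (key2B), then diff the two Inspect
                # results below.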

                # remove the original parsed key. source was already removed by h2o
                # don't have to now. we use a new name for key2B
                # h2o.nodes[0].remove_key(key2)
                start = time.time()
                key2B = key2 + "_B"
                parseKeyB = h2o_cmd.parseFile(csvPathname=csvDownloadPathname,
                                              key2=key2B)
                print csvDownloadPathname, "download/reparse (B) parse end. Original data from", \
                    csvFilename, 'took', time.time() - start, 'seconds'
                inspect = h2o_cmd.runInspect(key=key2B)

                missingValuesListB = h2o_cmd.infoFromInspect(
                    inspect, csvPathname)
                num_colsB = inspect['num_cols']
                num_rowsB = inspect['num_rows']
                row_sizeB = inspect['row_size']
                value_size_bytesB = inspect['value_size_bytes']

                df = h2o_util.JsonDiff(inspectFirst, inspect, with_values=True)
                print "df.difference:", h2o.dump_json(df.difference)

                for i, d in enumerate(df.difference):
                    # ignore mismatches in these
                    #  "variance"
                    #  "response.time"
                    #  "key"
                    if "variance" in d or "response.time" in d or "key" in d or "value_size_bytes" in d or "row_size" in d:
                        pass
                    else:
                        raise Exception(
                            "testing %s, found unexpected mismatch in df.difference[%d]: %s"
                            % (csvPathname, i, d))

                if DO_SIZE_CHECKS and enableSizeChecks:
                    # if we're allowed to do size checks, compare the full json response!
                    print "Comparing original inspect to the inspect after parsing the downloaded csv"
                    # vice_versa=True
                    self.assertGreater(len(df.difference), 29,
                        msg="Want >= 30, not %d differences between the two inspect json responses. %s" % \
                            (len(df.difference), h2o.dump_json(df.difference)))

                    # this fails because h2o writes out zeroes as 0.0000* which gets loaded as fp even if col is all zeroes
                    # only in the case where the libsvm dataset specified vals = 0, which shouldn't happen
                    # make the check conditional based on the dataset
                    self.assertEqual(
                        row_sizeA, row_sizeB,
                        "row_size mismatches after re-parse of downloadCsv result %d %d"
                        % (row_sizeA, row_sizeB))
                    self.assertEqual(
                        value_size_bytesA, value_size_bytesB,
                        "value_size_bytes mismatches after re-parse of downloadCsv result %d %d"
                        % (value_size_bytesA, value_size_bytesB))

                print "missingValuesListA:", missingValuesListA
                print "missingValuesListB:", missingValuesListB
                self.assertEqual(
                    missingValuesListA, missingValuesListB,
                    "missingValuesList mismatches after re-parse of downloadCsv result"
                )
                self.assertEqual(
                    num_colsA, num_colsB,
                    "num_cols mismatches after re-parse of downloadCsv result %d %d"
                    % (num_colsA, num_colsB))
                self.assertEqual(
                    num_rowsA, num_rowsB,
                    "num_rows mismatches after re-parse of downloadCsv result %d %d"
                    % (num_rowsA, num_rowsB))

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #45
    def test_KMeans_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "/datasets/kmeans_big"
            csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([
                0.0, -113.00566692375459, -89.99595447985321,
                -455.9970643424373, 4732.0, 49791778.0, 36800.0
            ], 248846122, 1308149283316.2988),
            ([
                0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                25654042.00592703, 28304.0
            ], 276924291, 1800760152555.98),
            ([
                0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                31319.99486705394
            ], 235089554, 375419158808.3253),
            ([
                0.0, 10.0, -72.00113070337981, -171.0198611715457,
                4430.00952228909, 37007399.0, 29894.0
            ], 166180630, 525423632323.6474),
            ([
                0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                22865824.99639042, 5335.0
            ], 167234179, 1845362026223.1094),
            ([
                0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                -47537.998050740985
            ], 195420925, 197941282992.43475),
            ([
                0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289,
                1928.0, 39967190.0, 27202.0
            ], 214401768, 11868360232.658035),
            ([
                0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                30712.99115201907
            ], 258853406, 598863991074.3276),
            ([
                0.0, 21.0, 114.01584574295777, 242.99690338815898,
                1674.0029079209912, 33089556.0, 36415.0
            ], 190979054, 1505088759456.314),
            ([
                0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                -48473733.04122273, 47343.0
            ], 87794427, 1124697008162.3955),
            ([
                0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                16716.003410920028
            ], 78226988, 1151439441529.0215),
            ([
                0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                -14930.007919032574
            ], 167273589, 693036940951.0249),
            ([
                0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                11767.998552236539
            ], 148426180, 35942838893.32379),
            ([
                0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                -23336.998167498707
            ], 157533313, 88431531357.62982),
            ([
                0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008,
                2320.0, 46602185.0, 11212.0
            ], 118361306, 1111537045743.7646),
        ]
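        # Reading note (an assumption from how bigCheckResults and
        # compareResultsToExpected consume these below): each entry looks like
        # (per-col cluster center, rows in cluster, within-cluster error).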

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []
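        # Only the last assignment wins; the earlier lists appear to be a
        # record of the logging combinations tried before disabling it.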

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            if FROM_HDFS:
                importFolderResult = h2i.setupImportHdfs(
                    None, importFolderPath)
            else:
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)

            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            key2 = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseKey = h2i.parseImportHdfsFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)
            else:
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # KMeans ****************************************
            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'cols': None,
                'destination_key': 'junk.hex',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None
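            # With range(6) above, the three init modes each get two trials.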

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey,
                                           timeoutSecs=timeoutSecs,
                                           benchmarkLogging=benchmarkLogging,
                                           **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseKey, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
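            # i.e. per the "multipliers" note above, center coords, cluster
            # size, and error may each drift up to 1% of expected.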
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=True,
                                                trial=trial)