Example #1
    def test_GBMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]
        modelKey = 'GBMGrid_benign'

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'STR',
            'learn_rate': '.1,.2,.25',
            'ntrees': '3:5:1',
            'max_depth': '5,7',
            'min_rows': '1,2',
            'response': 'FNDX',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        h2o_gbm.showGBMGridResults(GBMResult, 0)
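
A note on the grid syntax above: comma lists such as '.1,.2,.25' enumerate explicit values, while a colon triple such as '3:5:1' appears to be a from:to:step range (so ntrees='3:5:1' covers 3, 4 and 5). As a hedged sketch of that expansion (parse_grid_spec is a hypothetical helper, not part of the h2o test harness):

    def parse_grid_spec(spec):
        # hypothetical helper: expand 'from:to:step' or 'a,b,c' into a value list
        if ':' in spec:
            lo, hi, step = (float(x) for x in spec.split(':'))
            vals = []
            while lo <= hi:
                vals.append(lo)
                lo += step
            return vals
        return [float(x) for x in spec.split(',')]

    print parse_grid_spec('3:5:1')      # [3.0, 4.0, 5.0]
    print parse_grid_spec('.1,.2,.25')  # [0.1, 0.2, 0.25]
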
Example #2
    def test_GBMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = "logreg/" + csvFilename
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
        colNames = ["ID", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]

        modelKey = "GBMGrid_prostate"
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            "destination_key": modelKey,
            "ignored_cols_by_name": "ID",
            "learn_rate": ".1,.2",
            "ntrees": "1:3:1",
            "max_depth": "8,9",
            "min_rows": "1:5:2",
            "response": "CAPSULE",
            "classification": 1 if DO_CLASSIFICATION else 0,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        h2o_gbm.showGBMGridResults(GBMResult, 10)
Example #3
    def test_GBMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': '.1,.2',
            'ntrees': '1:3:1',
            'max_depth': '8,9',
            'min_rows': '1:5:2',
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        h2o_gbm.showGBMGridResults(GBMResult, 15)
Example #4
    def test_GBM_parseTrain(self):
        bucket = 'home-0xdiag-datasets'
       
        files = [('standard', 'covtype.data', 'covtype.hex', 1800, 54)]

        for importFolderPath,csvFilename,trainKey,timeoutSecs,response in files:
            # PARSE train****************************************
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename,
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            params = { 
                'destination_key': "GBMKEY",
                'learn_rate':.1,
                'ntrees':1,
                'max_depth':1,
                'min_rows':1,
                'response':response
            }   
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
            #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
            #    "%f pct. of timeout" % (GBMResult['python_%timeout'])
            GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            print GBMView['gbm_model']['errs']
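
This example introduces the asynchronous pattern most of the examples below rely on: runGBM with noPoll=True returns as soon as the job is submitted, and pollWaitJobs blocks until the job matching pattern finishes. A minimal sketch of that submit-then-wait sequence, assuming the same h2o_cmd/h2j helpers these tests import:

    def run_gbm_and_wait(parseResult, timeoutSecs=1800, **params):
        # submit without blocking; the returned dict describes the queued job
        result = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,
                                timeoutSecs=timeoutSecs, **params)
        # block until the matching job completes (pattern=None waits for all jobs)
        h2j.pollWaitJobs(pattern=params.get('destination_key'),
                         timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
        return result
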
Example #5
    def test_GBMGrid_basic_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': '4,100',
            'max_depth': 8,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)

            statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            # shouldn't need this?
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        # FIX! after gbm grid, have to get the model keys from the json?
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1==0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #6
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=importFolderPath + "/" + csvFilename,
            schema="put",
            hex_key=trainKey,
            timeoutSecs=timeoutSecs,
        )

        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = {
            "classification": 1,  # faster?
            "destination_key": modelKey,
            "learn_rate": 0.1,
            "ntrees": 3,
            "max_depth": 8,
            "min_rows": 1,
            "response": 0,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView["gbm_model"]["errs"][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView["gbm_model"]["cms"][-1]  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
Example #7
    def test_GBM_parseTrain(self):
        #folderpath, filename, keyname, timeout
        bucket = 'home-0xdiag-datasets'
        
        files = [('mnist', 'mnist_training.csv.gz', 'mnistsmalltrain.hex',1800,0)
                ]

        grid = [[1,10,100,1000], [0.0,0.01,0.001,0.0001,1], [1,2], [1,10,100]] 
        grid = list(itertools.product(*grid))
        grid = random.sample(grid, 10) #don't do all 120, but get a random sample
        for importFolderPath,csvFilename,trainKey,timeoutSecs,response in files:
            # PARSE train****************************************
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']
            csv_header = ('nJVMs','java_heap_GB', 'dataset', 'ntrees', 'max_depth', 'learn_rate', 'min_rows','trainTime')
            for ntree, learn_rate, max_depth, min_rows in grid:
                if not os.path.exists('gbm_grid.csv'):
                    output = open('gbm_grid.csv', 'w')
                    output.write(','.join(csv_header)+'\n')
                else:
                    output = open('gbm_grid.csv', 'a')

                csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                        dialect='excel', extrasaction='ignore',delimiter=',')
                java_heap_GB = h2o.nodes[0].java_heap_GB

                params = {
                 'destination_key': 'GBMKEY',
                 'learn_rate': learn_rate,
                 'ntrees':ntree,
                 'max_depth':max_depth,
                 'min_rows':min_rows,
                 'response':response
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True
                # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
                start = time.time()
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult,noPoll=True,timeoutSecs=timeoutSecs,**kwargs)
                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=3600,pollTimeoutSecs=3600)
                #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
                #    "%f pct. of timeout" % (GBMResult['python_%timeout'])
                #print GBMResult
                GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                print GBMView['gbm_model']['errs']
                elapsed = time.time() - start
                row = {'nJVMs':len(h2o.nodes),'java_heap_GB':java_heap_GB,'dataset':'mnist_training.csv.gz',
                       'learn_rate':learn_rate,'ntrees':ntree,'max_depth':max_depth,
                       'min_rows':min_rows, 'trainTime':elapsed}
                print row
                csvWrt.writerow(row)
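
The grid-sampling trick at the top of this example generalizes: enumerate the full Cartesian product of the parameter axes, then take a random subset so the run stays bounded. A standalone sketch of just that step:

    import itertools, random

    def sample_grid(axes, n, seed=None):
        # enumerate every combination, then sample without replacement
        combos = list(itertools.product(*axes))
        random.seed(seed)
        return random.sample(combos, min(n, len(combos)))

    # the axes used above: 4*5*2*3 = 120 combinations, sampled down to 10
    axes = [[1, 10, 100, 1000], [0.0, 0.01, 0.001, 0.0001, 1], [1, 2], [1, 10, 100]]
    for ntree, learn_rate, max_depth, min_rows in sample_grid(axes, 10, seed=42):
        print ntree, learn_rate, max_depth, min_rows
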
Example #8
    def test_GBMGrid_basic_many(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': '.1,.2',
            'ntrees': '8,10',
            'max_depth': '8,9',
            'min_rows': '1,2',
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 1,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
    
        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0
        # for more in range(8):
        # fast
        # for more in range(9):
        for i in range(5):
            kwargs = params.copy()
            kwargs['min_rows'] = '1,2,3'
            if DO_FROM_TO_STEP:
                kwargs['max_depth'] = '5:10:1'
            else:
                kwargs['max_depth'] = '5,6,10'

            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            # print "GBMResult:", h2o.dump_json(GBMResult)
            job_key = GBMResult['job_key']
            model_key = GBMResult['destination_key']
            jobs.append( (job_key, model_key) )
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        for job_key, model_key  in jobs:
            GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key)
            h2o_gbm.showGBMGridResults(GBMResult, 15)

        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs
Example #9
    def test_c9b_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            # passes 5, fails 15
            # for depth in [5,15,25,40]:
            for depth in [5,5,5,5,5]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs)

                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
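
pollStatsWhileBusy, used here and in Example #14, samples cluster utilization while the job runs and returns the averages as a dict. A minimal sketch of reporting them, using only the field names these tests read:

    statMean = h2j.pollStatsWhileBusy(timeoutSecs=1800, pollTimeoutSecs=30, retryDelaySecs=5)
    for field in ('num_cpus', 'my_cpu_%', 'sys_cpu_%', 'system_load'):
        print field, "=", statMean[field]
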
Example #10
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs=1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',  path=importFolderPath + "/" + csvFilename, schema='put',
            hex_key=trainKey, timeoutSecs=timeoutSecs)

        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = { 
            'classification': 1, # faster? 
            'destination_key': modelKey,
            'learn_rate': .1,
            'ntrees': 3,
            'max_depth': 8,
            'min_rows': 1,
            'response': 0, # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr'] # use the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #11
    def test_GBM_covtype_train_test(self):

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        trainKey = 'covtype.train.hex'
        modelKey = 'GBMModelKey'
        timeoutSecs = 1800

        parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs)

        kwargs = {
            'max_depth': 7,
            # 'max_depth': 10,
            'ntrees': 2,
            'response': 'C55'
        }

        h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
Example #12
    def test_GBMGrid_basic_benign(self):
        h2o.beta_features = True
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename 
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one
        
        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        # check the first in the models list. It should be the best
        colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ]
        modelKey = 'GBMGrid_benign'

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'STR',
            'learn_rate': '.1,.2',
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': 'FNDX',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            # no pattern waits for all
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)

        if 1==0:
            # FIX! get model?
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #13
    def test_GBMScore(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvTrainPath = importFolderPath + "/allyears2k.csv"
        csvTestPath = csvTrainPath
        #        importFolderPath = 'newairlines'
        #        csvTrainPath = importFolderPath + '/train/*train*'
        #        csvTestPath  = importFolderPath + '/train/*test*'
        trainhex = "train.hex"
        testhex = "test.hex"
        parseTrainResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvTrainPath,
            schema="local",
            hex_key=trainhex,
            timeoutSecs=2400,
            doSummary=False,
        )
        parseTestResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvTestPath,
            schema="local",
            hex_key=testhex,
            timeoutSecs=2400,
            doSummary=False,
        )
        inspect_test = h2o.nodes[0].inspect(testhex, timeoutSecs=8000)
        response = "IsDepDelayed"
        ignored_cols = "DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed"

        params = {
            "destination_key": "GBMScore",
            "response": response,
            "ignored_cols_by_name": ignored_cols,
            "classification": 1,
            "validation": None,
            "ntrees": 100,
            "max_depth": 10,
            "learn_rate": 0.00005,
        }

        parseResult = {"destination_key": trainhex}
        kwargs = params.copy()
        gbm = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=4800, **kwargs)

        scoreStart = time.time()
        h2o.nodes[0].generate_predictions(model_key="GBMScore", data_key=trainhex)
        scoreElapsed = time.time() - scoreStart

        print "It took ", scoreElapsed, " seconds to score ", inspect_test[
            "numRows"
        ], " rows. Using a GBM with 100 10-deep trees."
        print "That's ", 1.0 * scoreElapsed / 100.0, " seconds per 10-deep tree."
Example #14
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    "destination_key": "GBMKEY",
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": 10,
                    "max_depth": depth,
                    "min_rows": 10,
                    "response": response,
                    "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = statMean["num_cpus"]
                my_cpu_pct = statMean["my_cpu_%"]
                sys_cpu_pct = statMean["sys_cpu_%"]
                system_load = statMean["system_load"]
                # shouldn't need this?
                h2j.pollWaitJobs(
                    pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5
                )
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                # print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #15
    def test_GBMGrid_basic_benign(self):
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = "logreg/" + csvFilename
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one

        # check the first in the models list. It should be the best
        colNames = [
            "STR",
            "OBS",
            "AGMT",
            "FNDX",
            "HIGD",
            "DEG",
            "CHK",
            "AGP1",
            "AGMN",
            "NLV",
            "LIV",
            "WT",
            "AGLP",
            "MST",
        ]
        modelKey = "GBMGrid_benign"

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            "destination_key": modelKey,
            "ignored_cols_by_name": "STR",
            "learn_rate": ".1,.2,.25",
            "ntrees": "3:5:1",
            "max_depth": "5,7",
            "min_rows": "1,2",
            "response": "FNDX",
            "classification": 1 if DO_CLASSIFICATION else 0,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        h2o_gbm.showGBMGridResults(GBMResult, 0)
Example #16
    def test_GBMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = "logreg/" + csvFilename
        parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
        colNames = ["ID", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]

        modelKey = "GBMGrid_prostate"
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            "destination_key": modelKey,
            "ignored_cols_by_name": "ID",
            "learn_rate": 0.1,
            "ntrees": "2,4",
            "max_depth": 8,
            "min_rows": 1,
            "response": "CAPSULE",
            "classification": 1 if DO_CLASSIFICATION else 0,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMFirstResult["job_key"], destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1 == 0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView["gbm_model"]["errs"][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView["gbm_model"]["cm"]
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
Example #17
    def test_GBMGrid_basic_prostate(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': 'c(2,4)',
            'max_depth': 8,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
        print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
        # no pattern waits for all
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMFirstResult['job_key'], destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1==0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #18
    def test_GBM_ec2_airlines(self):

        h2o.beta_features = False

        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            for depth in [5,15]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs)
                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
Example #19
    def test_GBM_ec2_airlines(self):

        h2o.beta_features = False

        files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            h2o.beta_features = False  # turn off beta_features
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # GBM (train)****************************************
            # passes 5, fails 15
            # for depth in [5,15,25,40]:
            for depth in [5, 5, 5, 5, 5]:
                params = {
                    "destination_key": "GBMKEY",
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": 10,
                    "max_depth": depth,
                    "min_rows": 10,
                    "response": response,
                    "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
                h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
Example #20
    def test_GBM_mnist_restart_many(self):
        importFolderPath = "mnist"
        csvFilename = "train.csv.gz"
        timeoutSecs=1800
        trialStart = time.time()

        for trial in range(10):
            # PARSE train****************************************
            trainKey = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename, schema='put',
                hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            params = { 
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 10,
                'max_depth': 8,
                'min_rows': 1,
                'response': 784, # this dataset has the response in the last col (0-9 to check)
                # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
                }

            kwargs = params.copy()
            h2o.beta_features = True
            timeoutSecs = 1800
            # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            h2o.beta_features = False
            # if it fails, should happen within 8 secs
            time.sleep(8)
            h2j.cancelAllJobs()
            h2o.check_sandbox_for_errors()
            print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial

            if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
                h2i.delete_keys_at_all_nodes()
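
Example #20's loop is a start/cancel/verify stress pattern rather than a training run. Condensed to its core, with the same helpers the test uses:

    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    time.sleep(8)                    # a failing submit should surface within ~8 seconds
    h2j.cancelAllJobs()              # cancel whatever is still queued or running
    h2o.check_sandbox_for_errors()   # fail the trial if the node logs show errors
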
Example #21
    def test_GBM_mnist(self):
        importFolderPath = "mnist"
        csvFilename = "train.csv.gz"
        timeoutSecs=1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename, schema='put',
            hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = { 
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 8,
            'min_rows': 1,
            'response': 784, # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
            }

        kwargs = params.copy()
        h2o.beta_features = True
        timeoutSecs = 1800
        # noPoll=True: submit the job and return immediately instead of blocking until GBM finishes
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
        # hack!
        if h2o.beta_features:
            h2o_jobs.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #22
    def test_GBMScore(self):
        h2o.beta_features = False
        importFolderPath = 'standard'
        csvTrainPath = importFolderPath + '/allyears2k.csv'
        csvTestPath = csvTrainPath
        # importFolderPath = 'newairlines'
        # csvTrainPath = importFolderPath + '/train/*train*'
        # csvTestPath  = importFolderPath + '/train/*test*'
        trainhex = 'train.hex'
        testhex = 'test.hex'
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path = csvTrainPath, schema = 'local', hex_key = trainhex, timeoutSecs = 2400, doSummary = False)
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path = csvTestPath, schema = 'local', hex_key = testhex, timeoutSecs = 2400, doSummary = False)
        inspect_test   = h2o.nodes[0].inspect(testhex, timeoutSecs=8000)
        h2o.beta_features = True
        response = 'IsDepDelayed'
        ignored_cols = 'DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'


        params   =  {'destination_key'      : 'GBMScore',
                     'response'             : response,
                     'ignored_cols_by_name' : ignored_cols,
                     'classification'       : 1,
                     'validation'           : None,
                     'ntrees'               : 100,
                     'max_depth'            : 10,
                     'learn_rate'           : 0.00005,
                    }

        parseResult = {'destination_key' : trainhex}
        kwargs    = params.copy()
        gbm = h2o_cmd.runGBM(parseResult = parseResult, timeoutSecs=4800, **kwargs)

        scoreStart = time.time()
        h2o.nodes[0].generate_predictions(model_key = 'GBMScore', data_key = trainhex)
        scoreElapsed = time.time() - scoreStart

        print "It took ", scoreElapsed, " seconds to score ", inspect_test['num_rows'], " rows. Using a GBM with 100 10-deep trees."
        print "That's ", 1.0*scoreElapsed / 100.0  ," seconds per 10-deep tree."
Example #23
    def test_rf_enums_mappings(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
        ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname,
                                         enumList,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname,
                                         enumListForScore,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? doesn't matter for speedrf
            # does speedrf try one more time? with 3 cols, mtries=2, so another try might
            # get a look at the missing col
            # does matter for drf2. does it "just stop"
            # trying mtries always looking at all columns or 1 col might be interesting
            if SPEEDRF:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'ntrees': 1,
                    'max_depth': 100,
                    # 'oobee': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'mtries': COLS,
                }
            elif GBM:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                    # 'learn_rate': .1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'classification': 1,
                }
            else:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'nbins': 1024,
                    'mtries': COLS,
                }

            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                                               timeoutSecs=timeoutSecs,
                                               pollTimeoutSecs=180,
                                               **kwargs)

                    print "gbm end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                                  timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=180,
                                                  **kwargs)
                    print "speedrf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             pollTimeoutSecs=180,
                                             **kwargs)
                    print "rf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                h2o_cmd.runScore(dataKey=scoreDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
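
The separator handling near the top of this example ('2c'.decode('hex')) relies on Python 2 str semantics. A sketch of the Python 3 equivalent, since these tests predate it:

    colSepHexString = '2c'
    colSepChar = bytes.fromhex(colSepHexString).decode('ascii')  # ','
    colSepInt = int(colSepHexString, base=16)                    # 44
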
Example #24
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz',
                 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex'
                 )
            ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
            ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            h2o.beta_features = False  #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='s3n',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                noPoll=h2o.beta_features,
                                                doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)
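            # For illustration, with the localhost trainKey above the string
            # sent to exec is:
            #   file_100.hex[,378] = file_100.hex[,378]>15 ? 1 : 0
            # i.e. column 378 is overwritten with a 0/1 indicator so it can
            # serve as a binomial response.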

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               noPoll=h2o.beta_features,
                                               doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            print "Slow! exec is converting all imported keys, not just what was parsed"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols':
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'],
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    # hack
                    if h2o.beta_features:
                        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                         pollTimeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict',  # choices are 0 and 'predict'
                    )
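                    # vactual/vpredict select which column to compare in each
                    # frame; 'predict' is the prediction column written by
                    # runPredict above.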

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)

            h2o.beta_features = False

            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                                  fListTitle, fList, fLabel)
Example #25
    def test_GBM_manyfiles_multijob(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None,
                 'file_1.dat.gz', 'test.hex')
            ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'test.hex')
            ]

        # if I go to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 something we can do binomial regression on!
            # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            # inc by 1 for R col
            # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate
            # only a problem if they share the dataset, do classification with integers.
            # change to factor here, to avoid the problem
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (trainKey)

            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
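            # With DO_FAIL off, the combined expression (localhost trainKey) is:
            #   train.hex[,378+1]=train.hex[,378+1]>15; factor(train.hex[, 378+1]);
            # i.e. flip to 0/1, then mark the column as a factor so parallel
            # GBM jobs don't collide on the integer->enum translate noted above.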

            # Parse (test)****************************************
            csvPathname = importFolderPath + "/" + testFilename
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            # plus 1 for R indexing
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            # add 1 for start-with-1
            ignored_cols_by_name = ",".join(
                map(lambda x: "C" + str(x + 1), random.sample(x, 300)))
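            # e.g. if random.sample picked indices 0, 4 and 41 (illustrative
            # values), this yields "C1,C5,C42": 1-based, 'C'-prefixed column
            # names, matching the fvec naming used for 'response' below.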

            print "Using the same response %s for train and test (which should have a output value too)" % 'C' + str(
                response + 1)

            ntrees = 10
            trial = 0
            # ignore 200 random cols (not the response)
            print "Kicking off multiple GBM jobs at once"
            # GBM train****************************************
            if DO_FAIL:
                cases = [5, 10, 20, 40]
            else:
                cases = [5, 10, 20]

            for max_depth in cases:
                trial += 1

                params = {
                    'response': "C" + str(response + 1),
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'validation': parseTestResult['destination_key'],
                    'ignored_cols_by_name': ignored_cols_by_name,
                    'grid_parallelism': 1,
                    'classification': 1 if DO_CLASSIFICATION else 0,
                }

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                # can take 4 times as long with 4 jobs?
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs * 4,
                                                destination_key=modelKey +
                                                "_" + str(trial),
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                              pollTimeoutSecs=timeoutSecs,
                                              retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                             pollTimeoutSecs=timeoutSecs)
Example #26
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [
                (100000, 400, 'cA', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (100000, 100, 'cD', 300), 
                (100000, 200, 'cE', 300), 
                (100000, 500, 'cG', 300), 
                (100000, 1000, 'cI', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE train****************************************
            h2o.beta_features = False  # turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            # l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                # len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            # was failing with 100 trees
            # ntrees = 100
            # for max_depth in [5,10,20,40]:
            ntrees = 10
            for max_depth in [5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 10, # 1024 fail
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': num_cols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #27
    def test_GBM_regression_rand2(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
            # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
            ('standard', 'covtype.shuffled.10pct.sorted.data',
             'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data',
             'covtype.test.hex')
        ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=importFolderPath + "/" +
                                                trainFilename,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", trainKey

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", testKey

            paramsDict = define_gbm_params()
            for trial in range(3):
                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {
                    'response': 'C55',
                    # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9',
                    'ntrees': 2,
                    'classification': 0,
                    'validation': testKey,
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                print "gbmTrainView:", h2o.dump_json(gbmTrainView)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                # for regression, the cms are all null, so don't print

                # GBM test****************************************
                predictKey = 'Predict.hex'
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(data_key=testKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"
Example #28
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if h2o.localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
                ]

        # if I go to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            print "Slow! exec is converting all imported keys, not just what was parsed"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols': 
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'], 
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict', # choices are 0 and 'predict'
                        )

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)


            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #29
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=importFolderPath + "/" +
                                       csvFilename,
                                       schema='put',
                                       hex_key=trainKey,
                                       timeoutSecs=timeoutSecs)

        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = {
            'classification': 1,  # faster? 
            'destination_key': modelKey,
            'learn_rate': .1,
            'ntrees': 3,
            'max_depth': 8,
            'min_rows': 1,
            'response': 0,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll=True: runGBM returns before the model is finished
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                        noPoll=True,
                                        **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200,
                                    pollTimeoutSecs=120,
                                    retryDelaySecs=5)
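        # runGBM returned immediately (noPoll=True); pollStatsWhileBusy blocks
        # here, sampling cluster stats until the GBM job completes.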
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr']  # use the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(
                gbmTrainView['gbm_model']['errs'])
Example #30
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I go to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False  # turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)
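            # colSwap replaces column 378 in place; with the localhost
            # trainKey this expands to:
            #   train.hex=colSwap(train.hex,378,(train.hex[378]>15 ? 1 : 0))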

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(num_cols)
            del x[response]
            ignored_cols_by_name = ",".join(map(str,random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
                if FORCE_FAIL_CASE:
                    params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} 

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                print "This is crazy!"
                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #31
    def test_GBM_cancel_model_reuse(self):
        h2o.beta_features = True
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            # ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', 
                timeoutSecs=50)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', 
                timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            # print "\nparseResult", h2o.dump_json(parseResult)
            print "Parse result['destination_key']:", parseResult['destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: 
            # Only integer or enum/factor columns can be classified

            if DO_CLASSIFICATION:
                # need to flip the right col! (R wise)
                execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1)
                kwargs = { 'str': execExpr }
                resultExec = h2o_cmd.runExec(**kwargs)
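                # e.g. with response=378 the expression sent is:
                #   c.hex[,379]=c.hex[,379]>15
                # (1-based, R-style column indexing, hence response+1)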

            # lets look at the response column now
            s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
            # x = range(542)
            # remove the output too! (378)
            ignoreIndex = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]
            # have to add 1 for col start with 1, now. plus the C
            xIgnore = ",".join(["C" + str(i+1) for i in ignoreIndex])

            params = {
                'destination_key': None,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response+1),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
                }

            kwargs = params.copy()
            timeoutSecs = 1800

            for i in range(5):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []     
                for j in range(5):
                    # FIX! apparently we can't reuse a model key after a cancel
                    kwargs['destination_key'] = 'GBMBad' + str(j)
                    # rjson error in poll_url: Job was cancelled by user!
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                    jobids.append(GBMFirstResult['job_key'])
                    h2o.check_sandbox_for_errors()
                    
                # have to pass the job id
                # for j in jobids:
                #     h2o.nodes[0].jobs_cancel(key=j)

                h2o_jobs.cancelAllJobs()
                # PUB-361. going to wait after cancel before reusing keys
                time.sleep(3)
                # am I getting a subsequent parse job cancelled?
                h2o_jobs.showAllJobs()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
Example #32
    def test_GBM_poker_1m(self):
        h2o.beta_features = True
        for trial in range(2):
            # PARSE train****************************************
            h2o.beta_features = False  # turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'
            timeoutSecs = 900
            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            csvPathname = 'poker/poker-hand-testing.data'
            hex_key = 'poker-hand-testing.data.hex'
            parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 2
            for max_depth in [5,10,20]:
                params = {
                    'learn_rate': .1,
                    'nbins': 10,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': numCols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #33
    def test_GBM_params_rand2(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
            # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
            ('standard', 'covtype.shuffled.10pct.sorted.data',
             'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data',
             'covtype.test.hex')
        ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=importFolderPath + "/" +
                                                trainFilename,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(
                key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                # translate it (only really needs to happen once; could move out of the loop?)
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54,
                    'ignored_cols_by_name': 'C1,C2,C3,C4,C5',
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'],
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict',  # choices are 7 (now) and 'predict'
                    )

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Example #34
0
    def test_c10_rel_gbm(self):
        h2o.beta_features = True
        print "not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Test***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        testFilename = 'classification1Test.txt'
        testPathname = importFolderPath + "/" + testFilename

        start = time.time()
        parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        trainFilename = 'classification1Train.txt'
        trainPathname = importFolderPath + "/" + trainFilename

        start = time.time()
        parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', 
            timeoutSecs=500, doSummary=True)
        print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, trainPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # GBM Train***********************************************************
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        # response = 0
        # doesn't work if index is used?
        response = 'outcome'

        # x = range(inspect['num_cols'])
        # del x[response]
        ntrees = 10
        # fails with 40
        params = {
            'learn_rate': .2,
            'nbins': 1024,
            'ntrees': ntrees,
            'max_depth': 20,
            'min_rows': 2,
            'response': response,
            'cols': x,
            # 'ignored_cols_by_name': None,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        modelKey = 'GBMModelKey'

        timeoutSecs = 900

        trainStart = time.time()
        gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
            timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
        trainElapsed = time.time() - trainStart
        print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        # get the last cm
        cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
        print "Last line of this cm might be NAs, not CM"
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # GBM test****************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        start = time.time()
        gbmTestResult = h2o_cmd.runPredict(
            data_key=parseTestResult['destination_key'],
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename


        if DO_PREDICT_CM:
            gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='predict',
                predict=predictKey,
                vpredict='predict', # choices are 7 (now) and 'predict'
                )

            # errs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Example #35
0
    def test_GBM_with_cancels(self):
        print "do import/parse with VA"
        h2o.beta_features = False

        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename

            ### h2o.beta_features = False

            (importResult,
             importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                              path=csvPathname,
                                              schema='local',
                                              timeoutSecs=50)
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=csvPathname,
                schema='local',
                hex_key='c.hex',
                timeoutSecs=500,
                noPoll=False,
                doSummary=False
            )  # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # hack it because no response from Parse2
            if h2o.beta_features:
                parseResult = {'destination_key': 'c.hex'}

            print "\nparseResult", h2o.dump_json(parseResult)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified

            if importFolderPath == 'manyfiles-nflx-gz':
                if DO_CLASSIFICATION:
                    # need to flip the right col! (R wise)
                    execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1,
                                                             response + 1)
                    kwargs = {'str': execExpr}
                    resultExec = h2o_cmd.runExec(**kwargs)

                # lets look at the response column now
                h2o.beta_features = True
                s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
                # x = range(542)
                # remove the output too! (378)
                xIgnore = []
                # BUG if you add unsorted 378 to end. remove for now
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541, response
                ]:
                    # add 1 because cols are now 1-based; prefix with 'C'
                    xIgnore.append("C" + str(i + 1))
            else:
                # leave one col ignored, just to see?
                xIgnore = 'C1'

            modelKey = "GBMGood"
            params = {
                'destination_key': modelKey,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response + 1),
                'classification': 1 if DO_CLASSIFICATION else 0,
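                # grid_parallelism is presumably how many models H2O will build
                # concurrently (an assumption; this test doesn't document it)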
                'grid_parallelism': 4,
            }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                            noPoll=True,
                                            **kwargs)
            print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
            # no pattern waits for all

            for i in range(15):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []
                for j in range(5):
                    # FIX! apparently we can't reuse a model key after a cancel
                    kwargs['destination_key'] = 'GBMBad' + str(i) + str(j)
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                                    noPoll=True,
                                                    **kwargs)
                    jobids.append(GBMFirstResult['job_key'])

                # have to pass the job id
                for j in jobids:
                    h2o.nodes[0].jobs_cancel(key=j)

            h2o_jobs.pollWaitJobs(pattern='GBMGood',
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds."

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(
                    gbmTrainView['gbm_model']['errs'])

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename,
                                                   importResult=importResult)
Example #36
0
    def test_GBM_parseTrain(self):
        # folderpath, filename, keyname, timeout, response
        bucket = 'home-0xdiag-datasets'

        files = [('mnist', 'mnist_training.csv.gz', 'mnistsmalltrain.hex',
                  1800, 0)]

        grid = [[1, 10, 100, 1000], [0.0, 0.01, 0.001, 0.0001, 1], [1, 2],
                [1, 10, 100]]
        grid = list(itertools.product(*grid))
        grid = random.sample(grid, 10)  # don't do all 120, but get a random sample
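        # itertools.product expands the full 4 * 5 * 2 * 3 = 120-point grid;
        # sampling 10 keeps runtime bounded while still covering the space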
        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=importFolderPath + "/" +
                                           csvFilename,
                                           schema='local',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']
            csv_header = ('nJVMs', 'java_heap_GB', 'dataset', 'ntrees',
                          'max_depth', 'learn_rate', 'min_rows', 'trainTime')
            for ntree, learn_rate, max_depth, min_rows in grid:
                if not os.path.exists('gbm_grid.csv'):
                    output = open('gbm_grid.csv', 'w')
                    output.write(','.join(csv_header) + '\n')
                else:
                    output = open('gbm_grid.csv', 'a')

                csvWrt = csv.DictWriter(output,
                                        fieldnames=csv_header,
                                        restval=None,
                                        dialect='excel',
                                        extrasaction='ignore',
                                        delimiter=',')
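                # extrasaction='ignore' lets the row dicts carry keys beyond
                # csv_header without DictWriter raising a ValueError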
                java_heap_GB = h2o.nodes[0].java_heap_GB

                params = {
                    'destination_key': 'GBMKEY',
                    'learn_rate': learn_rate,
                    'ntrees': ntree,
                    'max_depth': max_depth,
                    'min_rows': min_rows,
                    'response': response
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                # noPoll=True: start GBM asynchronously, then wait on the job queue below
                start = time.time()
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                           noPoll=True,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                h2j.pollWaitJobs(pattern="GBMKEY",
                                 timeoutSecs=3600,
                                 pollTimeoutSecs=3600)
                #print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
                #    "%f pct. of timeout" % (GBMResult['python_%timeout'])
                #print GBMResult
                GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                print GBMView['gbm_model']['errs']
                elapsed = time.time() - start
                row = {
                    'nJVMs': len(h2o.nodes),
                    'java_heap_GB': java_heap_GB,
                    'dataset': 'mnist_training.csv.gz',
                    'learn_rate': learn_rate,
                    'ntrees': ntree,
                    'max_depth': max_depth,
                    'min_rows': min_rows,
                    'trainTime': elapsed
                }
                print row
                csvWrt.writerow(row)
                output.close()
Example #37
0
    def test_GBMGrid_basic_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        colNames = [
            'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL',
            'GLEASON'
        ]

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': '4,100',
            'max_depth': 8,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                   noPoll=not DO_POLL,
                                   **kwargs)
        if not DO_POLL:
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)

            statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300,
                                                   pollTimeoutSecs=10,
                                                   retryDelaySecs=5)
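            # pollStatsWhileBusy presumably samples cloud CPU/load stats while
            # the job runs and returns their means (hence the statMean reads below)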
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            # shouldn't need this?
            h2o_jobs.pollWaitJobs(pattern=None,
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)

        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        # FIX! after gbm grid, have to get the model keys from the json?
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'],
                                                 destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1 == 0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1]  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(
                    gbmTrainView['gbm_model']['errs'])
Example #38
0
    def test_GBMGrid_basic_many(self):
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        colNames = [
            'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL',
            'GLEASON'
        ]

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': '.1,.2',
            'ntrees': '8,10',
            'max_depth': '8,9',
            'min_rows': '1,2',
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            'grid_parallelism': 1,
        }

        kwargs = params.copy()
        timeoutSecs = 1800

        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0
        # for more in range(8):
        # fast
        # for more in range(9):
        for i in range(5):
            kwargs = params.copy()
            kwargs['min_rows'] = '1,2,3'
            if DO_FROM_TO_STEP:
                kwargs['max_depth'] = '5:10:1'
            else:
                kwargs['max_depth'] = '5,6,10'
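            # '5:10:1' is the from:to:step form of H2O's grid spec, while
            # '5,6,10' enumerates the values explicitly; both expand to a list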

            GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                       noPoll=True,
                                       **kwargs)
            # print "GBMResult:", h2o.dump_json(GBMResult)
            job_key = GBMResult['job_key']
            model_key = GBMResult['destination_key']
            jobs.append((job_key, model_key))
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        for job_key, model_key in jobs:
            GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key,
                                                   destination_key=model_key)
            h2o_gbm.showGBMGridResults(GBMResult, 15)

        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs
Example #39
0
                    # 'ignored_cols_by_name': None,
                    'importance': importance,
                    'group_split': group_split,
>>>>>>> 50f5b1b8c94b6ce7cd5ec175fecdca811f41487f
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # translate it (only really needs doing once; move out of the loop?)
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
Example #40
0
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800,
                  'IsDepDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: start GBM asynchronously, then wait on the job queue below
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                           noPoll=True,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=30,
                                                  retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern="GBMKEY",
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs,
                                 retryDelaySecs=5)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #41
0
    def test_GBM_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        importFolderPath = "datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([
                0.0, -113.00566692375459, -89.99595447985321,
                -455.9970643424373, 4732.0, 49791778.0, 36800.0
            ], 248846122, 1308149283316.2988),
            ([
                0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                25654042.00592703, 28304.0
            ], 276924291, 1800760152555.98),
            ([
                0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                31319.99486705394
            ], 235089554, 375419158808.3253),
            ([
                0.0, 10.0, -72.00113070337981, -171.0198611715457,
                4430.00952228909, 37007399.0, 29894.0
            ], 166180630, 525423632323.6474),
            ([
                0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                22865824.99639042, 5335.0
            ], 167234179, 1845362026223.1094),
            ([
                0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                -47537.998050740985
            ], 195420925, 197941282992.43475),
            ([
                0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289,
                1928.0, 39967190.0, 27202.0
            ], 214401768, 11868360232.658035),
            ([
                0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                30712.99115201907
            ], 258853406, 598863991074.3276),
            ([
                0.0, 21.0, 114.01584574295777, 242.99690338815898,
                1674.0029079209912, 33089556.0, 36415.0
            ], 190979054, 1505088759456.314),
            ([
                0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                -48473733.04122273, 47343.0
            ], 87794427, 1124697008162.3955),
            ([
                0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                16716.003410920028
            ], 78226988, 1151439441529.0215),
            ([
                0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                -14930.007919032574
            ], 167273589, 693036940951.0249),
            ([
                0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                11767.998552236539
            ], 148426180, 35942838893.32379),
            ([
                0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                -23336.998167498707
            ], 157533313, 88431531357.62982),
            ([
                0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008,
                2320.0, 46602185.0, 11212.0
            ], 118361306, 1111537045743.7646),
        ]

        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            # hex_key = csvFilename + "_" + str(trial) + ".hex"
            hex_key = "C" + str(trial)
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60,
                                           retryDelaySecs=2,
                                           benchmarkLogging=benchmarkLogging,
                                           **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # GBM ****************************************
            if not DO_GBM:
                continue

            # make col 2 binomial (negative numbers in source)
            col = 2
            execExpr = "%s[,%s] = (%s[,%s] > -7 ? 1 : 0)" % (hex_key, col, hex_key, col)
            resultExec = h2o_cmd.runExec(str=execExpr)
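            # the exec expression uses H2O's R-like indexing: it overwrites
            # column 2 in place with 1 where the value is > -7, else 0, so the
            # response below is binomial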

            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': col  # should be binomial from above
            }

            kwargs = params.copy()
            timeoutSecs = 1800

            start = time.time()
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                       noPoll=True,
                                       **kwargs)
            # wait for it to show up in jobs?
            time.sleep(2)
            # no pattern waits for all
            h2o_jobs.pollWaitJobs(pattern=None,
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (
                GBMResult['python_%timeout'])

            print "\nGBMResult:", GBMResult
            # print "\nGBMResult:", h2o.dump_json(GBMResult)

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_at_all_nodes()
Example #42
0
    def test_GBM_regression_rand2(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", trainKey

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", testKey

            paramsDict = define_gbm_params()
            for trial in range(3):
                # use this to set any defaults you want if the pick doesn't set
                print "Regression!"
                params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0}
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                # for regression, the cms are all null, so don't print

                # GBM test****************************************
                predictKey = 'Predict.hex'
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=testKey,
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
                print "FIX! where do we get the summary info on the test data after predict?"
Example #43
0
    def test_GBM_poker_1m(self):
        for trial in range(2):
            # PARSE train****************************************
            h2o.beta_features = False  # turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'
            timeoutSecs = 900
            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            csvPathname = 'poker/poker-hand-testing.data'
            hex_key = 'poker-hand-testing.data.hex'
            parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 2
            for max_depth in [5,10,20]:
                params = {
                    'learn_rate': .1,
                    'nbins': 10,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': num_cols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #44
0
    def test_GBM_sphere15_180GB(self):
        csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            # hex_key = csvFilename + "_" + str(trial) + ".hex"
            hex_key = "C" + str(trial)
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # GBM ****************************************
            if not DO_GBM:
                continue

            # make col 2 binomial (negative numbers in source)
            col = 2
            execExpr = "%s[,%s] = (%s[,%s] > -7 ? 1 : 0)" % (hex_key, col, hex_key, col)
            resultExec = h2o_cmd.runExec(str=execExpr)

            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': col # should be binomial from above
                }

            kwargs = params.copy()
            h2o.beta_features = True
            timeoutSecs = 1800

            start = time.time()
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            # wait for it to show up in jobs?
            time.sleep(2)
            # no pattern waits for all
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (GBMResult['python_%timeout'])

            print "\nGBMResult:", GBMResult
            # print "\nGBMResult:", h2o.dump_json(GBMResult)

            h2o.beta_features = False
            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_at_all_nodes()
Example #45
0
    def test_GBM_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        tryList = [
            (10000,  100, 'cA', 300),
            (10000,  300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            # (10000,  3000, 'cK', 3600),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)
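            # write_syn_dataset() is defined elsewhere in this file; presumably it
            # writes rowCount x colCount CSV rows whose values come from
            # translateList (the a..u enum levels above), seeded with SEEDPERFILE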

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(numCols-1),
                    'ignored_cols_by_name': None,
                }
                # both response variants should work?
                # if random.randint(0,1):
                #    params['response'] = numCols-1,
                
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #46
0
    def test_GBM_with_cancels(self):

        print "Sets h2o.beta_features like -bf at command line"
        print "this will redirect import and parse to the 2 variants"
        h2o.beta_features = True

        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            # ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            
            ### h2o.beta_features = False

            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', 
                timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # hack it because no response from Parse2
            if h2o.beta_features:
                parseResult = {'destination_key': 'c.hex'}

            print "\nparseResult", h2o.dump_json(parseResult)

            print "Parse result['destination_key']:", parseResult['destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified

            if importFolderPath=='manyfiles-nflx-gz':
                if DO_CLASSIFICATION:
                    # need to flip the right col! (R wise)
                    execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1)
                    kwargs = { 'str': execExpr }
                    resultExec = h2o_cmd.runExec(**kwargs)

                # lets look at the response column now
                s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
                x = range(542)
                # remove the output too! (378)
                xIgnore = []
                # BUG if you add unsorted 378 to end. remove for now
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]:
                    if i not in x:
                        print "x:", x
                        print 'missing?', i
                        continue  # skip it; x.remove(i) would raise ValueError
                    x.remove(i)
                    xIgnore.append(i)

                x = ",".join(map(str,x))
                def colIt(x): return "C" + str(x)
                xIgnore = ",".join(map(colIt, xIgnore))
            else:
                # leave one col ignored, just to see?
                xIgnore = 0

            modelKey = "GBMGood"
            params = {
                'destination_key': modelKey,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
                }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
            print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
            # no pattern waits for all

            for i in range(20):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []
                for j in range(5):
                    kwargs['destination_key'] = 'GBMBad' + str(j)
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                    jobids.append(GBMFirstResult['job_key'])

                # have to pass the job id
                for j in jobids:
                    h2o.nodes[0].jobs_cancel(key=j)


            h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds."

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
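
# A hedged sketch of the launch-then-cancel pattern used above, factored into
# a helper; it only uses calls that already appear in this test (runGBM with
# noPoll=True, jobs_cancel). The helper name, nJobs, and keyPrefix are
# placeholders, not part of the original test.
def launch_and_cancel_gbm(parseResult, baseKwargs, nJobs=5, keyPrefix='GBMBad'):
    jobids = []
    for j in range(nJobs):
        kw = baseKwargs.copy()
        kw['destination_key'] = keyPrefix + str(j)
        result = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kw)
        jobids.append(result['job_key'])
    # cancel by job id; any 'good' foreground job is left to run to completion
    for j in jobids:
        h2o.nodes[0].jobs_cancel(key=j)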
Example #47
0
def doGBM(f, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
    debug = False
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre    = 'DEBUG'
    gbmbenchcsv = 'benchmarks/'+build+'/'+pre+'gbmbench.csv'
    if not os.path.exists(gbmbenchcsv):
        output = open(gbmbenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(gbmbenchcsv,'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"    
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        h2o.beta_features = False #ensure this is false! 
        if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets',
                                       path             = csvPathname,
                                       schema           = 'local',
                                       hex_key          = hex_key,
                                       header           = 1,
                                       header_from_file = headerKey,
                                       separator        = 44,
                                       timeoutSecs      = 16000,
                                       retryDelaySecs   = 5,
                                       pollTimeoutSecs  = 16000,
                                       noPoll           = True,
                                       doSummary        = False
                                      )
        h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=16000, retryDelaySecs=5)
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        h2o.beta_features = False #make sure false for the inspect as well!
        inspect_train  = h2o.nodes[0].inspect(hex_key, timeoutSecs=16000)
        inspect_test   = h2o.nodes[0].inspect(testFilehex, timeoutSecs=16000)
        h2o.beta_features = True #ok, can be true again
        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row.update( {'h2o_build'          : build,
                     'nMachines'          : nMachines,
                     'nJVMs'              : len(h2o.nodes),
                     'Xmx/JVM'            : java_heap_GB,
                     'dataset'            : f,
                     'nTrainRows'         : inspect_train['num_rows'],
                     'nTestRows'          : inspect_test['num_rows'],
                     'nCols'              : inspect_train['num_cols'],
                     'trainParseWallTime' : parseWallTime,
                     'nTrees'             : ntrees,
                     'minRows'            : minrows,
                     'maxDepth'           : depth,
                     'learnRate'          : learnRate,
                     'classification'     : classification,
                    })
    
        params   =  {'destination_key'      : 'GBM('+f+')',
                     'response'             : response,
                     'ignored_cols_by_name' : ignored_cols,
                     'classification'       : classification,
                     'validation'           : testFilehex,
                     'ntrees'               : ntrees,
                     'max_depth'            : depth,
                     'min_rows'             : minrows,
                     'nbins'                : nbins,
                     'learn_rate'           : learnRate,
                    }
    
        parseResult = {'destination_key' : hex_key}
        kwargs    = params.copy()
        gbmStart  = time.time()
        #TODO(spencer): Uses jobs to poll for gbm completion
        gbm       = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=120, retryDelaySecs=5)
        gbmTime   = time.time() - gbmStart
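        # 'json' below is presumably a module-level string (a benchmark config
        # name) set elsewhere in the harness, not the stdlib json module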
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        row.update( {'gbmBuildTime'       : gbmTime,
                    })
        gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
        if classification:
            cm = gbmTrainView['gbm_model']['cm']
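            # for a 2x2 confusion matrix, the off-diagonal entries are the
            # misclassified counts; err is their fraction of all scored rows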
            err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
        else:
            err = gbmTrainView['gbm_model']['errs'][-1]
        row.update({'Error' : err})
        csvWrt.writerow(row)
    finally:
        output.close()
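
# A hedged usage sketch for doGBM above; every literal (dataset name, folder,
# ignored column, hyperparameters, response) is a made-up placeholder, and the
# module globals build and csv_header plus a parsed test frame
# 'AirlinesTest.hex' are assumed to exist already.
row = {}
doGBM('AirlinesTrain1x', 'Airlines',
      ignored_cols='C1', classification=1, testFilehex='AirlinesTest.hex',
      ntrees=50, depth=5, minrows=10, nbins=20,
      learnRate=0.1, response='IsDepDelayed', row=row)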
Example #48
0
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees,
          depth, minrows, nbins, learnRate, response, row):
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug: pre = 'DEBUG'
        gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv, 'w')
            output.write(','.join(csv_header) + '\n')
        else:
            output = open(gbmbenchcsv, 'a')
        csvWrt = csv.DictWriter(output,
                                fieldnames=csv_header,
                                restval=None,
                                dialect='excel',
                                extrasaction='ignore',
                                delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + "/" + folderPath
            if (f in [
                    'AirlinesTrain1x', 'AllBedroomsTrain1x',
                    'AllBedroomsTrain10x', 'AllBedroomsTrain100x',
                    'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x'
            ]):
                csvPathname = importFolderPath + "/" + f + '.csv'
            else:
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           header=1,
                                           header_from_file=headerKey,
                                           separator=44,
                                           timeoutSecs=7200,
                                           retryDelaySecs=5,
                                           pollTimeoutSecs=7200)

            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime, " seconds."

            inspect_train = h2o.nodes[0].inspect(
                parseResult['destination_key'])
            inspect_test = h2o.nodes[0].inspect(testFilehex)

            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(
                h2o_hosts.hosts)
            row.update({
                'h2o_build': build,
                'nMachines': nMachines,
                'nJVMs': len(h2o.nodes),
                'Xmx/JVM': java_heap_GB,
                'dataset': f,
                'nTrainRows': inspect_train['numRows'],
                'nTestRows': inspect_test['numRows'],
                'nCols': inspect_train['numCols'],
                'trainParseWallTime': parseWallTime,
                'classification': classification,
            })

            params = {
                'destination_key': 'GBM(' + f + ')',
                'response': response,
                'ignored_cols_by_name': ignored_cols,
                'classification': classification,
                'validation': testFilehex,
                'ntrees': ntrees,
                'max_depth': depth,
                'min_rows': minrows,
                'nbins': nbins,
                'learn_rate': learnRate,
            }

            kwargs = params.copy()
            gbmStart = time.time()
            #TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm = h2o_cmd.runGBM(parseResult=parseResult,
                                 noPoll=True,
                                 timeoutSecs=4800,
                                 **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200,
                                  pollTimeoutSecs=120,
                                  retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime = time.time() - gbmStart
            row.update({
                'gbmBuildTime': gbmTime,
            })
            #TODO(spencer): Add in gbm scoring
            #gbmScoreStart = time.time()
            #gbmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
            #scoreTime     = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
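
# Usage sketch for this variant (values hypothetical): unlike the single-file
# doGBM above, it expects a dict of training file lists, e.g.
# fs = {'train': ['CovTypeTrain1x']}, and assumes the module globals build,
# csv_header, and debug are already set up by the surrounding harness.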
Example #49
0
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 something we can do binomial regression on!
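            # (h2o exec2 expressions are R-like and 1-based, hence the 378+1:
            # this rewrites the response column in place as 0/1 so it can be
            # used for binomial classification)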
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note: no inspect of test data here, so the translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300)))
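            # note: FVec column names are 1-based ('C1', 'C2', ...), so 0-based
            # index i maps to name 'C' + str(i+1); the response at index 378 is 'C379'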

            print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1)

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response+1),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response+1),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #50
0
    def test_GBM_covtype_train_test(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        # h2b.browseTheCloud()

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            ntrees = 2
            # fails with 40
            for max_depth in [40, 5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                h2o.beta_features = True
                # translate it (only really need to do this once, out of the loop?)
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #51
0
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if h2o.localhost:
            tryList = [
                (10000, 100, 'cA', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (10000, 100, 'cD', 300), 
                (10000, 200, 'cE', 300), 
                (10000, 300, 'cF', 300), 
                (10000, 400, 'cG', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)


            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, 
                doSummary=False)
            # hack

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']


            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 5 
            prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
            # for max_depth in [5,10,20,40]:
            for max_depth in [5, 10, 20]:

                # PARSE a new header****************************************
                print "Creating new header", hdrPathname
                prefix = prefixList.pop(0)
                write_syn_header(hdrPathname, rowCount, colCount, prefix)

                # upload and parse the header to a hex

                hdr_hex_key = prefix + "_hdr.hex"
                parseHdrResult = h2i.import_parse(bucket=None, path=hdrPathname, schema='put',
                    header=1, # REQUIRED! otherwise will interpret as enums
                    hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False)
                # Set Column Names (before autoframe is created)
                h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key)

                # GBM
                print "response col name is changing each iteration: parsing a new header"
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': prefix + "_response",
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

                # works if you delete the autoframe
                ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe')

        # just plot the last one
        if DO_PLOT:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
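
# A hedged sketch of the write_syn_header helper this test assumes (its real
# definition lives elsewhere in the file): a single CSV header row whose
# column names carry the iteration prefix, ending with the '<prefix>_response'
# name that params['response'] references above. The exact naming scheme and
# the sketch's name are assumptions; rowCount is unused here, matching the
# call signature.
def write_syn_header_sketch(pathname, rowCount, colCount, prefix):
    with open(pathname, 'w') as f:
        cols = [prefix + '_' + str(i) for i in range(colCount)]
        cols.append(prefix + '_response')  # matches params['response'] above
        f.write(','.join(cols) + '\n')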
Example #52
0
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note: no inspect of test data here, so the translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            # response = 378
            response = 'C379'

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[378]  # 0-based index of the response column 'C379'; response is a name here, not an index
            ignored_cols_by_name = ",".join(map(lambda i: 'C' + str(i+1), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have an output value too)" % response

            ntrees = 10
            # ignore 200 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #53
0
    def test_GBMGrid_basic_benign(self):
        h2o.beta_features = True
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=csvFilename + ".hex",
                                       schema='put')
        # columns start at 0
        # cols 0-13. 3 is output
        # no member id in this one

        # fails with n_folds
        print "Not doing n_folds with benign. Fails with 'unable to solve?'"
        # check the first in the models list. It should be the best
        colNames = [
            'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN',
            'NLV', 'LIV', 'WT', 'AGLP', 'MST'
        ]
        modelKey = 'GBMGrid_benign'

        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'STR',
            'learn_rate': '.1,.2',
            'ntrees': 2,
            'max_depth': 8,
            'min_rows': 1,
            'response': 'FNDX',
            'classification': 1 if DO_CLASSIFICATION else 0,
        }
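        # comma-separated parameter values (learn_rate here) are what make this
        # a grid search: one model per listed value, with scalar params held fixed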

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                   noPoll=not DO_POLL,
                                   **kwargs)
        if not DO_POLL:
            # no pattern waits for all
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
            h2o_jobs.pollWaitJobs(pattern=None,
                                  timeoutSecs=300,
                                  pollTimeoutSecs=10,
                                  retryDelaySecs=5)
        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'],
                                                 destination_key=modelKey)

        if 1 == 0:
            # FIX! get model?
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1]  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(
                    gbmTrainView['gbm_model']['errs'])