def test_rf_covtype_train_full_fvec(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", 
            timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = kwargs['ntrees'] * 60
            start = time.time()
            print "Note train.csv is used for both train and validation"
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            job_key = rfv['job_key']
            model_key = rfv['destination_key']
            rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], 
                model_key=model_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=1, print_params=True)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            self.assertLess(classification_error, 3, "classification error on covtype.data should be below 3 pct.: %s" % classification_error)

            print "Trial #", trial, "completed"
Example #2
    def test_rf_covtype_train_full_fvec(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", 
            timeoutSecs=180)

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = kwargs['ntrees'] * 60
            start = time.time()
            print "Note train.csv is used for both train and validation"
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            job_key = rfv['job_key']
            model_key = rfv['destination_key']
            rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], 
                model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            # hmm..just using defaults above in RF?
            self.assertLess(classification_error, 4.8, "classification error on covtype.data should be below 4.8 pct.: %s" % classification_error)

            print "Trial #", trial, "completed"
Example #3
 def completionHack(jobKey, modelKey):
     if DO_POLL: # not needed
         pass
     else: 
         h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
     # print "FIX! how do we get the GLM result"
     params = {'_modelKey': modelKey}
     a = h2o.nodes[0].completion_redirect(jsonRequest="2/GLMModelView.json", params=params)
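
completion_redirect above fetches the finished model's JSON view from a node. A rough stdlib equivalent of that request, assuming an h2o-2 node answering on localhost:54321 (the default port); the 2/GLMModelView.json endpoint and _modelKey parameter come from the call above, the rest is hypothetical:

    import json
    import urllib
    import urllib2

    def fetch_glm_model_view(modelKey, host='localhost', port=54321):
        # GET /2/GLMModelView.json?_modelKey=<key> and decode the JSON body
        qs = urllib.urlencode({'_modelKey': modelKey})
        url = 'http://%s:%d/2/GLMModelView.json?%s' % (host, port, qs)
        return json.load(urllib2.urlopen(url))
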
Example #5
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=importFolderPath + "/" + csvFilename,
            schema="put",
            hex_key=trainKey,
            timeoutSecs=timeoutSecs,
        )

        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = {
            "classification": 1,  # faster?
            "destination_key": modelKey,
            "learn_rate": 0.1,
            "ntrees": 3,
            "max_depth": 8,
            "min_rows": 1,
            "response": 0,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll=True: runGBM returns immediately; poll for completion below
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView["gbm_model"]["errs"][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView["gbm_model"]["cms"][-1]  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
Example #6
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs=1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',  path=importFolderPath + "/" + csvFilename, schema='put',
            hex_key=trainKey, timeoutSecs=timeoutSecs)

        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = { 
            'classification': 1, # faster? 
            'destination_key': modelKey,
            'learn_rate': .1,
            'ntrees': 3,
            'max_depth': 8,
            'min_rows': 1,
            'response': 0, # the response column holds the digit label (0-9)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll=True: runGBM returns immediately; poll for completion below
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr'] # use the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #7
    def test_GBMGrid_basic_prostate(self):
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # columns start at 0
        csvPathname = 'logreg/' + csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
        colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']

        modelKey = 'GBMGrid_prostate'
        # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
        params = {
            'destination_key': modelKey,
            'ignored_cols_by_name': 'ID',
            'learn_rate': .1,
            'ntrees': '4,100',
            'max_depth': 8,
            'min_rows': 1,
            'response': 'CAPSULE',
            'classification': 1 if DO_CLASSIFICATION else 0,
            }

        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
        if not DO_POLL:
            print "\nfirst GBMResult:", h2o.dump_json(GBMResult)

            statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            # shouldn't need this?
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        elapsed = time.time() - start
        print "GBM training completed in", elapsed, "seconds."

        # FIX! after gbm grid, have to get the model keys from the json?
        gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)
        print h2o.dump_json(gbmGridView)

        if 1==0:
            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
Example #9
    def test_c9b_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [
                 ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
                ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, 
                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            # passes 5, fails 15
            # for depth in [5,15,25,40]:
            for depth in [5,5,5,5,5]:
                params = {
                    'destination_key': "GBMKEY",
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': 10,
                    'max_depth': depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: runGBM returns immediately; poll for completion below
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs)

                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

                h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
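
The long ignored_cols_by_name string above is easy to mistype. Building it from a Python list keeps the excluded columns reviewable; most are presumably dropped because they encode the flight's outcome (delays, cancellation) and would leak the IsDepDelayed answer:

    leaky_cols = [
        'CRSDepTime', 'CRSArrTime', 'ActualElapsedTime', 'CRSElapsedTime',
        'AirTime', 'ArrDelay', 'DepDelay', 'TaxiIn', 'TaxiOut', 'Cancelled',
        'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay',
        'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'IsArrDelayed',
    ]
    ignored_cols_by_name = ','.join(leaky_cols)
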
Example #10
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    "destination_key": "GBMKEY",
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": 10,
                    "max_depth": depth,
                    "min_rows": 10,
                    "response": response,
                    "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: runGBM returns immediately; poll for completion below
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
                num_cpus = (statMean["num_cpus"],)
                my_cpu_pct = (statMean["my_cpu_%"],)
                sys_cpu_pct = (statMean["sys_cpu_%"],)
                system_load = statMean["system_load"]
                # shouldn't need this?
                h2j.pollWaitJobs(
                    pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5
                )
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                # print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #11
    def test_rf_covtype_train_full_fvec(self):
        h2o.beta_features = True
        csvFilename = "covtype.data"
        csvPathname = "standard/" + csvFilename
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=180
        )

        for trial in range(1):
            # params is mutable. This is default.
            kwargs = paramDict
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = kwargs["ntrees"] * 60
            start = time.time()
            print "Note train.csv is used for both train and validation"
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )

            job_key = rfv["job_key"]
            model_key = rfv["destination_key"]
            rfv = h2o_cmd.runRFView(
                data_key=parseResult["destination_key"], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
            )

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
            # hmm..just using defaults above in RF?
            self.assertLess(
                classification_error,
                4.8,
                "classification error on covtype.data should be below 4.8 pct.: %s" % classification_error,
            )

            print "Trial #", trial, "completed"
Example #12
    def test_GBM_manyfiles_multijob(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None,
                 'file_1.dat.gz', 'test.hex')
            ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'test.hex')
            ]

        # if I go to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 into something we can do binomial regression on!
            # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            # inc by 1 for R col
            # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate
            # only a problem if they share the dataset, do classification with integers.
            # change to factor here, to avoid the problem
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (trainKey)

            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            csvPathname = importFolderPath + "/" + testFilename
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 into something we can do binomial regression on!
            # plus 1 for R indexing
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            # add 1 for start-with-1
            ignored_cols_by_name = ",".join(
                map(lambda x: "C" + str(x + 1), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % 'C' + str(
                response + 1)

            ntrees = 10
            trial = 0
            # ignore 300 random cols (not the response)
            print "Kicking off multiple GBM jobs at once"
            # GBM train****************************************
            if DO_FAIL:
                cases = [5, 10, 20, 40]
            else:
                cases = [5, 10, 20]

            for max_depth in cases:
                trial += 1

                params = {
                    'response': "C" + str(response + 1),
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'validation': parseTestResult['destination_key'],
                    'ignored_cols_by_name': ignored_cols_by_name,
                    'grid_parallelism': 1,
                    'classification': 1 if DO_CLASSIFICATION else 0,
                }

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                # can take 4 times as long with 4 jobs?
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs * 4,
                                                destination_key=modelKey +
                                                "_" + str(trial),
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                              pollTimeoutSecs=timeoutSecs,
                                              retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                             pollTimeoutSecs=timeoutSecs)
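
The ignored_cols_by_name construction above samples 300 non-response columns and rewrites them as 1-based C-names. The same logic isolated below; the seed argument is my addition for reproducibility, not in the test:

    import random

    def random_ignored_cols(numCols, response, k, seed=None):
        # pick k column indexes, never the response, and emit "C<i+1>"
        # names because the parsed frame's column names are 1-based (C1, C2, ...)
        if seed is not None:
            random.seed(seed)
        candidates = [i for i in range(numCols) if i != response]
        return ",".join("C" + str(i + 1) for i in random.sample(candidates, k))

    print random_ignored_cols(numCols=20, response=3, k=5, seed=42)
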
Example #13
    def test_PCA_ignore_enums_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100, 3, 'cA', 300), 
            # (10001, 2, 'cA', 300), 
            # (10000, 500, 'cH', 300), 
            # (10000, 1000, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            start = time.time()
            modelKey = 'PCAModelKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'ignored_cols': 'C1',
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])            
    
                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey = modelKey) 
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = pcaView
                # pull the standard deviations and proportions of variance from the PCA model
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
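
The sdev and propVar vectors printed above are tied together: a component's proportion of variance is its sdev squared over the sum of all squared sdevs. A quick pure-Python cross-check of that relationship:

    def prop_var(sdevs):
        # fraction of total variance explained by each component
        variances = [s * s for s in sdevs]
        total = float(sum(variances))
        return [v / total for v in variances]

    print prop_var([2.0, 1.0, 1.0])  # [0.666..., 0.166..., 0.166...]
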
Example #14
    def test_c7_rel(self):
        print "Running with h2o.beta_features=True for all"
        h2o.beta_features = True

        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB (ascii 9) as the separator, as opposed to generic whitespace
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # do summary of the parsed dataset last, since we know it fails on this dataset
        # does the json fail with too many??
        #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2)
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500)
        # can't do more than 1000
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows)

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        if DO_INSPECT:
            x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x
        else:
            x = None

        kwargs = {
            # 'x': x,
            'response': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            # 'thresholds': 0.5,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs)
        statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']
        # shouldn't need this?
        h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

        # can't figure out how I'm supposed to get the model
        # GLMModel = glm['GLMModel']
        # modelKey = GLMModel['model_key']
        # glmView = h2o.nodes[0].glm_view(modelKey=modelKey)


        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example #15
    def test_GBM_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilename = "mnist_training.csv.gz"
        timeoutSecs = 1800
        trialStart = time.time()

        # PARSE train****************************************
        trainKey = csvFilename + "_" + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=importFolderPath + "/" +
                                       csvFilename,
                                       schema='put',
                                       hex_key=trainKey,
                                       timeoutSecs=timeoutSecs)

        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        modelKey = "GBM_model"
        params = {
            'classification': 1,  # faster? 
            'destination_key': modelKey,
            'learn_rate': .1,
            'ntrees': 3,
            'max_depth': 8,
            'min_rows': 1,
            'response': 0,  # the response column holds the digit label (0-9)
            # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed?
        }

        kwargs = params.copy()
        timeoutSecs = 1800
        # noPoll=True: runGBM returns immediately; poll for completion below
        start = time.time()
        GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult,
                                        noPoll=True,
                                        **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200,
                                    pollTimeoutSecs=120,
                                    retryDelaySecs=5)
        elapsed = time.time() - start

        print "GBM training completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        errsLast = gbmTrainView['gbm_model']['errs'][-1]

        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cms = gbmTrainView['gbm_model']['cms']
            cm = cms[-1]['_arr']  # use the last one
            print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
            print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(
                gbmTrainView['gbm_model']['errs'])
Example #16
    def test_c9_GBM_airlines_hdfs(self):
        h2o.beta_features = True

        files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800,
                  'IsDepDelayed')]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GBM (train)****************************************
            for depth in [5, 15]:
                params = {
                    'destination_key':
                    "GBMKEY",
                    'learn_rate':
                    .2,
                    'nbins':
                    1024,
                    'ntrees':
                    10,
                    'max_depth':
                    depth,
                    'min_rows':
                    10,
                    'response':
                    response,
                    'ignored_cols_by_name':
                    'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                timeoutSecs = 1800
                start = time.time()
                print "Start time is: ", time.time()
                # noPoll=True: runGBM returns immediately; poll for completion below
                GBMResult = h2o_cmd.runGBM(parseResult=parseResult,
                                           noPoll=True,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=30,
                                                  retryDelaySecs=5)
                num_cpus = statMean['num_cpus']
                my_cpu_pct = statMean['my_cpu_%']
                sys_cpu_pct = statMean['sys_cpu_%']
                system_load = statMean['system_load']
                # shouldn't need this?
                h2j.pollWaitJobs(pattern="GBMKEY",
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs,
                                 retryDelaySecs=5)
                print "Finished time is: ", time.time()
                elapsed = time.time() - start
                print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
                #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
                #print GBMView['gbm_model']['errs']

        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Example #17
    def test_GLM2_mnist(self):
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema=schema,
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs)

            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is the digit label (the response)
            print "y:", y
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTestResult['destination_key'],
                timeoutSecs=300,
                returnIgnoreX=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema=schema,
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTrainResult['destination_key'],
                timeoutSecs=300,
                returnIgnoreX=True)
            print "ignoreX:", ignoreX

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX,
                'response': 'C' + str(y + 1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
            }

            if DO_ALL_DIGITS:
                cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (
                        trainKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (
                        testKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack,
                                                timeoutSecs=timeoutSecs,
                                                pollTimeoutSecs=60,
                                                noPoll=True,
                                                **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                            pollTimeoutSecs=60,
                                            retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][
                    -1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_val above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='B.hex',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 9,
                                "Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
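
The Exec2 expressions above, e.g. (A.hex[,1]==c), turn the 10-class digit label into a 0/1 column so a binomial GLM can be fit one class at a time. The same one-vs-rest relabeling in plain Python, for intuition only:

    def one_vs_rest(labels, c):
        # 1 where the label matches the target class, else 0;
        # this is what (A.hex[,y+1] == c) computes column-wise in Exec2
        return [1 if label == c else 0 for label in labels]

    print one_vs_rest([3, 8, 8, 0, 8], 8)  # [0, 1, 1, 0, 1]
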
Example #18
    def test_RF_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600), 
            # ("a.csv", "b.csv", 60),
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is the digit label (the response)
            print "y:", y
            # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local',
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)

            params = {
                'response': 'C' + str(y+1),
                'cols': None,
                'ignored_cols_by_name': ignore_x,
                'classification': 1,
                'validation': None,
                'ntrees': 2,
                'max_depth': 20,
                'min_rows': None,
                'nbins': 1000,
                'mtries': None,
                'sample_rate': 0.66,
                'seed': None,

                }

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            params['destination_key'] = 'RFModel_' + str(jobDispatch)
            kwargs = params.copy()
            timeoutSecs = 1200

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs)
            elapsed = time.time() - start

            # print h2o.dump_json(rfResult)
            print "rf job dispatch end on ", trainCsvFilename, 'took', time.time() - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = trainKey2
            rfView['model_key'] = kwargs['destination_key']
            rfView['ntrees'] = kwargs['ntrees']
            rfViewInitial.append(rfView)

            if not DO_POLL:
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)

        # FIX! need to add the rfview and predict stuff
        # we saved the initial response?
        # if we do another poll they should be done now, and better to get it that 
        # way rather than the inspect (to match what simpleCheckGLM expects)
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntrees = rfView['ntrees']

            rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False)
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            self.assertAlmostEqual(classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error)


            if not DO_POLL:
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5)
            # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False)
            # print "rfView:", h2o.dump_json(rfView)

            # "N":1,
            # "errs":[0.25,0.1682814508676529],
            # "testKey":"syn_binary_10000x10.hex",
            # "cm":[[3621,1399],[1515,3465]]}}
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            errs = rf_model['errs']

            # FIX! should update this expected classification error
            ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
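
rfViewInitial above is simple bookkeeping for fire-and-forget dispatch: record enough about each job to check it after one global poll barrier. The shape of that pattern, detached from the harness (kick_off_job and wait_for_all_jobs are hypothetical stand-ins):

    dispatched = []
    for jobDispatch in range(3):
        modelKey = 'RFModel_' + str(jobDispatch)
        # kick_off_job(modelKey)   # dispatch with noPoll=True, returns at once
        dispatched.append({'model_key': modelKey, 'ntrees': 2})

    # wait_for_all_jobs()          # one barrier, e.g. pollStatsWhileBusy
    for job in dispatched:
        print "would now fetch the RFView for", job['model_key']
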
Example #19
    def test_GBM_manyfiles_multijob(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I go to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 into something we can do binomial regression on!
            # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            # inc by 1 for R col
            # BUG: if left as integer, GBM changes the col to Enum, and multiple jobs collide on this translate.
            # only a problem if they share the dataset and do classification with integers.
            # change to factor here, to avoid the problem
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            if not DO_FAIL:
                execExpr += "; factor(%s[, 378+1]);" % (trainKey)

            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
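            # What the expression does: col 378 (0-based; hence 378+1 in the
            # 1-based R-style indexing Exec2 uses) is overwritten with the 0/1
            # result of the >15 comparison, and the optional factor() coerces it
            # to an Enum so concurrent GBM jobs don't race on the translation.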

            # Parse (test)****************************************
            csvPathname = importFolderPath + "/" + testFilename
            start = time.time()
            parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 into something we can do binomial regression on!
            # plus 1 for R indexing
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            if not DO_FAIL:
                execExpr +=  "; factor(%s[, 378+1]);" % (testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: "C" + str(x), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            trial = 0
            # ignore 300 random cols (not the response)
            print "Kicking off multiple GBM jobs at once"
            # GBM train****************************************
            if DO_FAIL:
                cases = [5, 10, 20, 40]
            else:
                cases = [5, 10, 20]

            for max_depth in cases:
                trial += 1

                params = {
                    'response': "C" + str(response),
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'validation': parseTestResult['destination_key'],
                    'ignored_cols_by_name': ignored_cols_by_name,
                    'grid_parallelism': 1,
                    'classification': 1 if DO_CLASSIFICATION else 0,
                }

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                # can take 4 times as long with 4 jobs?
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename


            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']

            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
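
# Small usage sketch for the stats dict returned by pollStatsWhileBusy above.
# The field names are the ones this file reads; the printing is illustrative
# only, not from the original test.
def print_cluster_stats(statMean):
    print "num_cpus:", statMean['num_cpus'], "system_load:", statMean['system_load']
    print "my_cpu_%:", statMean['my_cpu_%'], "sys_cpu_%:", statMean['sys_cpu_%']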
Example No. 20
    def test_c7_fvec(self):
        print "Since the python is not necessarily run as user=0xcust.."
        print "r can't use schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # apparently h2o will create a "_" to replace the "-"..so let's force the destination key name
        csvFilename = "part-00000b"
        hex_key = "part_00000b.hex"
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or the hex format?
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB (9) as the separator, as opposed to generic white space
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='local',
                                       separator=9,
                                       hex_key=hex_key,
                                       doSummary=False,
                                       timeoutSecs=500)

        print "Parse of", parseResult['destination_key'], "took", time.time(
        ) - start, "seconds"
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, hex_key, timeoutSecs=500)
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # do summary of the parsed dataset last, since we know it fails on this dataset
        # does the json fail with too many??
        #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2)
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500)
        # can't do more than 1000
        summaryResult = h2o_cmd.runSummary(key=hex_key,
                                           numCols=numCols,
                                           numRows=numRows,
                                           timeoutSecs=500)

        # there may be a lot of NAs.
        # we don't want to ignore any cols, and we don't want to ignore rows,
        # so impute to median

        # zero indexed column
        for column in range(numCols):
            print "Imputing any NAs in column %s to median" % column
            impResult = h2o.nodes[0].impute(source=hex_key,
                                            column=column,
                                            method='median')

        # check that there are no missing now
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        if len(missingValuesList) != 0:
            raise Exception("Shouldn't be missing values after impute: %s" %
                            missingValuesList)

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y,
                                        keepPattern=keepPattern,
                                        key=hex_key)
        print "x:", x

        kwargs = {
            'response': y,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 10,
            # 'thresholds': 0.5,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
        }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=timeoutSecs,
                             pollTimeoutSecs=60,
                             noPoll=True,
                             **kwargs)
        statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                          pollTimeoutSecs=30,
                                          retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']
        # shouldn't need this?
        h2j.pollWaitJobs(pattern=None,
                         timeoutSecs=timeoutSecs,
                         pollTimeoutSecs=30,
                         retryDelaySecs=5)

        # can't figure out how I'm supposed to get the model
        # GLMModel = glm['GLMModel']
        # modelKey = GLMModel['model_key']
        # glmView = h2o.nodes[0].glm_view(modelKey=modelKey)

        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example No. 21
    def test_RF_mnist_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600),
            # ("a.csv", "b.csv", 60),
            # ("mnist_testing.csv.gz", "mnist_testing.csv.gz",    600),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           trainCsvFilename,
                                           schema='local',
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            print "This is the 'ignore=' we'll use"
            ignore_x = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseResult['destination_key'],
                timeoutSecs=300,
                forRF=True)

            params = {
                'response': 'C' + str(y),
                'cols': None,
                'ignored_cols_by_name': ignore_x,
                'classification': 1,
                'validation': None,
                'ntrees': 10,
                'max_depth': 20,
                'min_rows': None,
                'nbins': 1000,
                'mtries': None,
                'sample_rate': 0.66,
                'seed': None,
            }

        rfViewInitial = []
        for jobDispatch in range(1):
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            params['destination_key'] = 'RFModel_' + str(jobDispatch)
            kwargs = params.copy()
            timeoutSecs = 1200

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     noPoll=not DO_POLL,
                                     rfView=DO_POLL,
                                     **kwargs)
            elapsed = time.time() - start

            # print h2o.dump_json(rfResult)
            print "rf job dispatch end on ", trainCsvFilename, 'took', time.time(
            ) - start, 'seconds'
            print "\njobDispatch #", jobDispatch
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = trainKey2
            rfView['model_key'] = kwargs['destination_key']
            rfView['ntrees'] = kwargs['ntrees']
            rfViewInitial.append(rfView)

            if not DO_POLL:
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200,
                                            pollTimeoutSecs=120,
                                            retryDelaySecs=5)

        # FIX! need to add the rfview and predict stuff
        # we saved the initial response above. if we do another poll, the jobs
        # should be done now, and it's better to get the result that way rather
        # than from inspect (to match what simpleCheckGLM expects)
        print "rfViewInitial", rfViewInitial
        for rfView in rfViewInitial:
            print "Checking completed job:", rfView
            print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntrees = rfView['ntrees']

            rfView = h2o_cmd.runRFView(None,
                                       model_key=model_key,
                                       timeoutSecs=60,
                                       noPoll=not DO_POLL,
                                       doSimpleCheck=False)
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            self.assertAlmostEqual(
                classification_error,
                10,
                delta=2,
                msg="Classification error %s differs too much" %
                classification_error)

            if not DO_POLL:
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=300,
                                            pollTimeoutSecs=120,
                                            retryDelaySecs=5)
            # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False)
            # print "rfView:", h2o.dump_json(rfView)

            # "N":1,
            # "errs":[0.25,0.1682814508676529],
            # "testKey":"syn_binary_10000x10.hex",
            # "cm":[[3621,1399],[1515,3465]]}}
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ntrees = rf_model['N']
            errs = rf_model['errs']

            # FIX! should update this expected classification error
            ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)
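
# Hedged sketch: pulling the final error out of an RFView response, using only
# the 'drf_model' fields this test already reads. That 'errs' is indexed by
# trees built is an assumption based on the commented JSON sample above.
def last_rf_error(rfView):
    rf_model = rfView['drf_model']
    return rf_model['errs'][-1]  # error after the last tree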
Example No. 22
    def test_GLM2_mnist_short(self):
        importFolderPath = "mnist"
        bucket = 'home-0xdiag-datasets'
        schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, 
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            
            # first col is pixel value ..use 0 here
            y = 0
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + trainCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, 
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, 
                returnIgnoreX=True)
            print "ignoreX:", ignoreX 

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX, 
                # first column is pixel value
                'response': 'C' + str(y+1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
                }

            cases = [8]
            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                execExpr="A.hex=%s" % (trainKey)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2o_cmd.runSummary(key=trainKey, cols=0, max_ncols=1, noPrint=False)
                h2o_cmd.runSummary(key='A.hex', cols=0, max_ncols=1, noPrint=False)

                execExpr="B.hex=%s" % (testKey)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2o_cmd.runSummary(key=testKey, cols=0, max_ncols=1, noPrint=False)
                h2o_cmd.runSummary(key='B.hex', cols=0, max_ncols=1, noPrint=False)


                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, 
                    noPoll=True, **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
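
# Sketch of the Exec2 binomial-relabel idiom above as a reusable helper.
# 'key', 'dest', 'y' and 'c' are hypothetical arguments; the expressions are
# the ones the test issues, with y+1 converting to 1-based (R-style) indexing.
# The h2o_exec module name is assumed from the 'h2e' alias used in this file.
def relabel_digit(key, dest, y, c):
    import h2o_exec as h2e
    h2e.exec_expr(execExpr="%s=%s" % (dest, key), timeoutSecs=30)
    h2e.exec_expr(execExpr="%s[,%s]=(%s[,%s]==%s)" % (dest, y+1, dest, y+1, c),
        timeoutSecs=30)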
Example No. 23
    def test_c7_fvec(self):
        print "Since the python is not necessarily run as user=0xcust.."
        print "r can't use schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        hex_key = csvFilename = ".hex"
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or the hex format?
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB (9) as the separator, as opposed to generic white space
        parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=500, separator=9, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, hex_key, timeoutSecs=500)
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # do summary of the parsed dataset last, since we know it fails on this dataset
        # does the json fail with too many??
        #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2)
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500)
        # can't do more than 1000
        summaryResult = h2o_cmd.runSummary(key=hex_key, numCols=numCols, numRows=numRows, timeoutSecs=500)

        # there may be a lot of NAs.
        # we don't want to ignore any cols, and we don't want to ignore rows,
        # so impute to median
        
        # zero indexed column
        for column in range(numCols):
            print "Imputing any NAs in column %s to median" % column
            impResult = h2o.nodes[0].impute(source=hex_key, column=column, method='median')

        # check that there are no missing now
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        if len(missingValuesList) != 0:
            raise Exception("Shouldn't be missing values after impute: %s" % missingValuesList)

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=hex_key)
        print "x:", x

        kwargs = {
            'response': y,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 10,
            # 'thresholds': 0.5,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, 
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs)
        statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']
        # shouldn't need this?
        h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

        # can't figure out how I'm supposed to get the model
        # GLMModel = glm['GLMModel']
        # modelKey = GLMModel['model_key']
        # glmView = h2o.nodes[0].glm_view(modelKey=modelKey)

        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example No. 24
    def test_c10_rel_gbm(self):
        h2o.beta_features = True
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Test***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        testFilename = 'classification1Test.txt'
        testPathname = importFolderPath + "/" + testFilename

        start = time.time()
        parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        trainFilename = 'classification1Train.txt'
        trainPathname = importFolderPath + "/" + trainFilename

        start = time.time()
        parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, trainPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # keepList = []
        # h2o_glm.findXFromColumnInfo(key=parseTrainResult['destination_key'], keepList=keepList)
        # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices
        # GBM Train***********************************************************
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        # response = 0
        # doesn't work if index is used?
        response = 'outcome'

        # x = range(inspect['num_cols'])
        # del x[response]
        ntrees = 100
        # fails with 40
        params = {
            'learn_rate': .2,
            'nbins': 1024,
            'ntrees': ntrees,
            'max_depth': 20,
            'min_rows': 2,
            'response': response,
            'cols': x,
            # 'ignored_cols_by_name': None,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        modelKey = 'GBMModelKey'

        timeoutSecs = 900

        trainStart = time.time()
        gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
            noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
        # hack
        h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
        trainElapsed = time.time() - trainStart
        print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        cm = gbmTrainView['gbm_model']['cm']
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
        print "Last line of this cm might be NAs, not CM"
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # GBM test****************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        start = time.time()
        gbmTestResult = h2o_cmd.runPredict(
            data_key=parseTestResult['destination_key'],
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename


        if DO_PREDICT_CM:
            gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='predict',
                predict=predictKey,
                vpredict='predict', # choices are 7 (now) and 'predict'
                )

            # errs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
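
# A condensed, hedged sketch of the score-then-summarize flow above, using the
# helper calls as they appear in this file. 'score_pct_wrong' and its argument
# names are hypothetical; 'Predict.hex' matches the key the test uses.
def score_pct_wrong(test_key, model_key, vactual, timeoutSecs=900):
    import h2o, h2o_cmd, h2o_gbm
    predictKey = 'Predict.hex'
    h2o_cmd.runPredict(data_key=test_key, model_key=model_key,
        destination_key=predictKey, timeoutSecs=timeoutSecs)
    cmResult = h2o.nodes[0].predict_confusion_matrix(actual=test_key,
        vactual=vactual, predict=predictKey, vpredict='predict')
    return h2o_gbm.pp_cm_summary(cmResult['cm'])  # pct wrong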
Example No. 25
    def test_GLM2_mnist(self):
        h2o.beta_features = True
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs)
            
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + trainCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True)
            print "ignoreX:", ignoreX 

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX, 
                'response': 'C' + str(y),
                # 'case_mode': '=',
                # 'case_val': 0,
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                ## 'weight': 1.0,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
                }

            if DO_ALL_DIGITS:
                cases = [0,1,2,3,4,5,6,7,8,9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr="A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr="B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, 
                    noPoll=True, **kwargs)
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_selfKey']

                # This seems wrong..what's the format of the cm?
                if 1==0:
                    cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr']
                    print "cm:", cm
                    pctWrong = h2o_gbm.pp_cm_summary(cm);
                    # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                    print "\nTrain\n==========\n"
                    print h2o_gbm.pp_cm(cm)


                # Score *******************************
                # this messes up if you use case_mode/case_val above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(
                    data_key='B.hex',
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y),
                    predict=predictKey,
                    vpredict='predict',
                    )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
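
# The async GLM2 pattern above, condensed into a hedged sketch: dispatch with
# noPoll=True, wait via job polling, then fetch the model through glm_view.
# Assumes the h2o_cmd / h2o_jobs helpers and the glm_view(_modelKey=...)
# signature used in this file; kwargs should carry 'destination_key': modelKey
# so the model lands where glm_view looks for it, as in the test above.
def glm2_async(parseResult, modelKey, timeoutSecs, **kwargs):
    import h2o, h2o_cmd, h2o_jobs
    h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,
        pollTimeoutSecs=60, noPoll=True, **kwargs)
    h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60,
        retryDelaySecs=5)
    return h2o.nodes[0].glm_view(_modelKey=modelKey)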
Example No. 26
    def test_GLM2_mnist_short(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        bucket = 'home-0xdiag-datasets'
        schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema=schema,
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"

            # first col is pixel value ..use 0 here
            y = 0
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTestResult['destination_key'],
                timeoutSecs=300,
                forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + trainCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema=schema,
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTrainResult['destination_key'],
                timeoutSecs=300,
                forRF=True)
            print "ignoreX:", ignoreX

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX,
                # first column is pixel value
                'response': 'C' + str(y + 1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
            }

            cases = [8]
            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                execExpr = "A.hex=%s" % (trainKey)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2o_cmd.runSummary(key=trainKey,
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)
                h2o_cmd.runSummary(key='A.hex',
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)

                execExpr = "B.hex=%s" % (testKey)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2o_cmd.runSummary(key=testKey,
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)
                h2o_cmd.runSummary(key='B.hex',
                                   cols=0,
                                   max_ncols=1,
                                   noPrint=False)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack,
                                                timeoutSecs=timeoutSecs,
                                                pollTimeoutSecs=60,
                                                noPoll=True,
                                                **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                            pollTimeoutSecs=60,
                                            retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                    (elapsed * 100) / timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
Example No. 27
    def test_c7_rel(self):
        print "Running with h2o.beta_features=True for all"
        h2o.beta_features = True

        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or the hex format?
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB (9) as the separator, as opposed to generic white space
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # do summary of the parsed dataset last, since we know it fails on this dataset
        # does the json fail with too many??
        #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2)
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500)
        # can't do more than 1000
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows)

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        kwargs = {
            'response': y,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 10,
            # 'thresholds': 0.5,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs)
        statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']
        # shouldn't need this?
        h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)

        # can't figure out how I'm supposed to get the model
        # GLMModel = glm['GLMModel']
        # modelKey = GLMModel['model_key']
        # glmView = h2o.nodes[0].glm_view(modelKey=modelKey)


        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example No. 28
    def test_c10_rel_gbm(self):
        h2o.beta_features = True
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Test***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        testFilename = 'classification1Test.txt'
        testPathname = importFolderPath + "/" + testFilename

        start = time.time()
        parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        trainFilename = 'classification1Train.txt'
        trainPathname = importFolderPath + "/" + trainFilename

        start = time.time()
        parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True)
        print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, trainPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # GBM Train***********************************************************
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        # response = 0
        # doesn't work if index is used?
        response = 'outcome'

        # x = range(inspect['num_cols'])
        # del x[response]
        ntrees = 10
        # fails with 40
        params = {
            'learn_rate': .2,
            'nbins': 1024,
            'ntrees': ntrees,
            'max_depth': 20,
            'min_rows': 2,
            'response': response,
            'cols': x,
            # 'ignored_cols_by_name': None,
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        modelKey = 'GBMModelKey'

        timeoutSecs = 900

        trainStart = time.time()
        gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
            noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
        # hack
        h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
        trainElapsed = time.time() - trainStart
        print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast

        # get the last cm
        cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
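        # ('cms' is a list of confusion-matrix snapshots, one per scoring step,
        # so [-1]['_arr'] takes the final one. Note the earlier copy of this
        # test read a flat 'cm' field instead; both shapes appear in GBM JSON.)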
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
        print "Last line of this cm might be NAs, not CM"
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)

        # GBM test****************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        start = time.time()
        gbmTestResult = h2o_cmd.runPredict(
            data_key=parseTestResult['destination_key'],
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename


        if DO_PREDICT_CM:
            gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=parseTestResult['destination_key'],
                vactual='predict',
                predict=predictKey,
                vpredict='predict', # choices are 7 (now) and 'predict'
                )

            # errs from end of list? is that the last tree?
            # all we get is cm
            cm = gbmPredictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm is really NAs, not CM"
            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)