def test_GBMGrid_basic_benign(self):
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    # columns start at 0
    # cols 0-13. 3 is output
    # no member id in this one
    # check the first in the models list. It should be the best
    colNames = [
        'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK',
        'AGP1','AGMN','NLV','LIV','WT','AGLP','MST'
    ]
    modelKey = 'GBMGrid_benign'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'STR',
        'learn_rate': '.1,.2,.25',
        'ntrees': '3:5:1',
        'max_depth': '5,7',
        'min_rows': '1,2',
        'response': 'FNDX',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."
    h2o_gbm.showGBMGridResults(GBMResult, 0)
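# The grid specs above mix two formats: comma lists ('.1,.2,.25') and
# from:to:step ranges ('3:5:1'). A minimal sketch of how such a spec expands
# into concrete grid values; expand_grid_spec is a hypothetical helper for
# illustration, not part of the H2O python client.
def expand_grid_spec(spec):
    """Expand 'a,b,c' or 'from:to:step' into a list of floats."""
    if ':' in spec:
        lo, hi, step = [float(x) for x in spec.split(':')]
        vals = []
        while lo <= hi:
            vals.append(lo)
            lo += step
        return vals
    return [float(x) for x in spec.split(',')]

# expand_grid_spec('3:5:1')     -> [3.0, 4.0, 5.0]
# expand_grid_spec('.1,.2,.25') -> [0.1, 0.2, 0.25]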
def test_GBMGrid_basic_prostate(self):
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = "logreg/" + csvFilename
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
    colNames = ["ID", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    modelKey = "GBMGrid_prostate"
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        "destination_key": modelKey,
        "ignored_cols_by_name": "ID",
        "learn_rate": ".1,.2",
        "ntrees": "1:3:1",
        "max_depth": "8,9",
        "min_rows": "1:5:2",
        "response": "CAPSULE",
        "classification": 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."
    h2o_gbm.showGBMGridResults(GBMResult, 10)
def test_GBMGrid_basic_prostate(self):
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': '.1,.2',
        'ntrees': '1:3:1',
        'max_depth': '8,9',
        'min_rows': '1:5:2',
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."
    h2o_gbm.showGBMGridResults(GBMResult, 15)
def test_GBM_parseTrain(self):
    bucket = 'home-0xdiag-datasets'
    files = [('standard', 'covtype.data', 'covtype.hex', 1800, 54)]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename,
            hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 1,
            'max_depth': 1,
            'min_rows': 1,
            'response': response
        }
        print "Using these parameters for GBM: ", params
        kwargs = params.copy()
        # noPoll -> False when GBM finished
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
        h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
        # print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
        #     "%f pct. of timeout" % (GBMResult['python_%timeout'])
        GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
        print GBMView['gbm_model']['errs']
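# runGBM with noPoll=True returns as soon as the job is dispatched; the
# pollWaitJobs call above is what actually blocks until the matching job key
# completes. A minimal sketch of that poll loop, assuming a hypothetical
# is_done() predicate (not an H2O API):
import time

def wait_for_job(is_done, timeoutSecs=1800, retryDelaySecs=5):
    """Poll is_done() until it returns True or timeoutSecs expires."""
    start = time.time()
    while not is_done():
        if time.time() - start > timeoutSecs:
            raise Exception("job did not finish within %d seconds" % timeoutSecs)
        time.sleep(retryDelaySecs)
    return time.time() - start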
def test_GBMGrid_basic_prostate(self):
    h2o.beta_features = True
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': .1,
        'ntrees': '4,100',
        'max_depth': 8,
        'min_rows': 1,
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
    if not DO_POLL:
        print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
        statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']
        # shouldn't need this?
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    # FIX! after gbm grid, have to get the model keys from the json?
    gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)
    print h2o.dump_json(gbmGridView)

    if 1==0:
        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1]  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBM_mnist_fvec(self):
    h2o.beta_features = True
    importFolderPath = "mnist"
    csvFilename = "mnist_training.csv.gz"
    timeoutSecs = 1800
    trialStart = time.time()

    # PARSE train****************************************
    trainKey = csvFilename + "_" + ".hex"
    start = time.time()
    parseResult = h2i.import_parse(
        bucket="home-0xdiag-datasets",
        path=importFolderPath + "/" + csvFilename,
        schema="put",
        hex_key=trainKey,
        timeoutSecs=timeoutSecs,
    )
    elapsed = time.time() - start
    print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
        (elapsed * 100) / timeoutSecs
    )
    print "parse result:", parseResult["destination_key"]

    # GBM (train)****************************************
    modelKey = "GBM_model"
    params = {
        "classification": 1,  # faster?
        "destination_key": modelKey,
        "learn_rate": 0.1,
        "ntrees": 3,
        "max_depth": 8,
        "min_rows": 1,
        "response": 0,  # this dataset has the response in col 0 (0-9 to check)
        # "ignored_cols_by_name": range(200, 784),  # only use the first 200 for speed?
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    # noPoll -> False when GBM finished
    start = time.time()
    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

    gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
    errsLast = gbmTrainView["gbm_model"]["errs"][-1]
    print "GBM 'errsLast'", errsLast

    if DO_CLASSIFICATION:
        cm = gbmTrainView["gbm_model"]["cms"][-1]  # use the last one
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)
    else:
        print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
def test_GBM_parseTrain(self):
    # folderpath, filename, keyname, timeout
    bucket = 'home-0xdiag-datasets'
    files = [('mnist', 'mnist_training.csv.gz', 'mnistsmalltrain.hex', 1800, 0)]

    grid = [[1,10,100,1000], [0.0,0.01,0.001,0.0001,1], [1,2], [1,10,100]]
    grid = list(itertools.product(*grid))
    grid = random.sample(grid, 10)  # don't do all 120, but get a random sample

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename,
            schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        csv_header = ('nJVMs','java_heap_GB','dataset','ntrees','max_depth','learn_rate','min_rows','trainTime')
        for ntree, learn_rate, max_depth, min_rows in grid:
            if not os.path.exists('gbm_grid.csv'):
                output = open('gbm_grid.csv', 'w')
                output.write(','.join(csv_header)+'\n')
            else:
                output = open('gbm_grid.csv', 'a')
            csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                dialect='excel', extrasaction='ignore', delimiter=',')
            java_heap_GB = h2o.nodes[0].java_heap_GB

            params = {
                'destination_key': 'GBMKEY',
                'learn_rate': learn_rate,
                'ntrees': ntree,
                'max_depth': max_depth,
                'min_rows': min_rows,
                'response': response
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True
            # noPoll -> False when GBM finished
            start = time.time()
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=3600, pollTimeoutSecs=3600)
            # print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \
            #     "%f pct. of timeout" % (GBMResult['python_%timeout'])
            # print GBMResult
            GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            print GBMView['gbm_model']['errs']
            elapsed = time.time() - start

            row = {'nJVMs': len(h2o.nodes), 'java_heap_GB': java_heap_GB, 'dataset': 'mnist_training.csv.gz',
                   'learn_rate': learn_rate, 'ntrees': ntree, 'max_depth': max_depth,
                   'min_rows': min_rows, 'trainTime': elapsed}
            print row
            csvWrt.writerow(row)
            output.close()  # close so the next iteration's append sees flushed rows
def test_GBMGrid_basic_many(self):
    h2o.beta_features = True
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': '.1,.2',
        'ntrees': '8,10',
        'max_depth': '8,9',
        'min_rows': '1,2',
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
        'grid_parallelism': 1,
    }
    kwargs = params.copy()
    timeoutSecs = 1800

    jobs = []
    # kick off 5 of these GBM grid jobs, with different tree choices
    start = time.time()
    totalGBMGridJobs = 0
    # for more in range(8):  # fast
    # for more in range(9):
    for i in range(5):
        kwargs = params.copy()
        kwargs['min_rows'] = '1,2,3'
        if DO_FROM_TO_STEP:
            kwargs['max_depth'] = '5:10:1'
        else:
            kwargs['max_depth'] = '5,6,10'

        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        # print "GBMResult:", h2o.dump_json(GBMResult)
        job_key = GBMResult['job_key']
        model_key = GBMResult['destination_key']
        jobs.append((job_key, model_key))
        totalGBMGridJobs += 1

    h2o_jobs.pollWaitJobs(timeoutSecs=300)
    elapsed = time.time() - start

    for job_key, model_key in jobs:
        GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key)
        h2o_gbm.showGBMGridResults(GBMResult, 15)

    print "All GBM jobs completed in", elapsed, "seconds."
    print "totalGBMGridJobs:", totalGBMGridJobs
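# The test above dispatches several grid jobs asynchronously, records each
# (job_key, model_key) pair, and only blocks once for all of them. A minimal
# sketch of that dispatch-then-collect shape; submit/wait_all/view are
# hypothetical stand-ins for runGBM(noPoll=True), pollWaitJobs, and
# gbm_grid_view:
def run_grid_jobs(submit, wait_all, view, nJobs=5):
    jobs = []
    for _ in range(nJobs):
        job_key, model_key = submit()  # returns immediately, like noPoll=True
        jobs.append((job_key, model_key))
    wait_all()  # block until every dispatched job completes
    return [view(job_key, model_key) for (job_key, model_key) in jobs]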
def test_c9b_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        # passes 5, fails 15
        # for depth in [5,15,25,40]:
        for depth in [5,5,5,5,5]:
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': 10,
                'max_depth': depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            num_cpus = statMean['num_cpus']
            my_cpu_pct = statMean['my_cpu_%']
            sys_cpu_pct = statMean['sys_cpu_%']
            system_load = statMean['system_load']
            # shouldn't need this?
            h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GBM_mnist_fvec(self):
    h2o.beta_features = True
    importFolderPath = "mnist"
    csvFilename = "mnist_training.csv.gz"
    timeoutSecs = 1800
    trialStart = time.time()

    # PARSE train****************************************
    trainKey = csvFilename + "_" + ".hex"
    start = time.time()
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename,
        schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
    elapsed = time.time() - start
    print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    print "parse result:", parseResult['destination_key']

    # GBM (train)****************************************
    modelKey = "GBM_model"
    params = {
        'classification': 1,  # faster?
        'destination_key': modelKey,
        'learn_rate': .1,
        'ntrees': 3,
        'max_depth': 8,
        'min_rows': 1,
        'response': 0,  # this dataset has the response in col 0 (0-9 to check)
        # 'ignored_cols_by_name': range(200,784)  # only use the first 200 for speed?
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    # noPoll -> False when GBM finished
    start = time.time()
    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

    gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
    errsLast = gbmTrainView['gbm_model']['errs'][-1]
    print "GBM 'errsLast'", errsLast

    if DO_CLASSIFICATION:
        cms = gbmTrainView['gbm_model']['cms']
        cm = cms[-1]['_arr']  # use the last one
        print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr']
        print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr']
        pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
        print "\nTrain\n==========\n"
        print h2o_gbm.pp_cm(cm)
    else:
        print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBM_covtype_train_test(self):
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'standard'
    trainFilename = 'covtype.shuffled.90pct.data'
    trainKey = 'covtype.train.hex'
    modelKey = 'GBMModelKey'
    timeoutSecs = 1800

    parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename,
        schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs)

    kwargs = {
        'max_depth': 7,
        # 'max_depth': 10,
        'ntrees': 2,
        'response': 'C55'
    }
    h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
def test_GBMGrid_basic_benign(self):
    h2o.beta_features = True
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    # columns start at 0
    # cols 0-13. 3 is output
    # no member id in this one

    # fails with n_folds
    print "Not doing n_folds with benign. Fails with 'unable to solve?'"

    # check the first in the models list. It should be the best
    colNames = [
        'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK',
        'AGP1','AGMN','NLV','LIV','WT','AGLP','MST'
    ]
    modelKey = 'GBMGrid_benign'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'STR',
        'learn_rate': '.1,.2',
        'ntrees': 2,
        'max_depth': 8,
        'min_rows': 1,
        'response': 'FNDX',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs)
    if not DO_POLL:
        # no pattern waits for all
        print "\nfirst GBMResult:", h2o.dump_json(GBMResult)
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey)

    if 1==0:
        # FIX! get model?
        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cms'][-1]  # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBMScore(self):
    h2o.beta_features = True
    importFolderPath = "standard"
    csvTrainPath = importFolderPath + "/allyears2k.csv"
    csvTestPath = csvTrainPath
    # importFolderPath = 'newairlines'
    # csvTrainPath = importFolderPath + '/train/*train*'
    # csvTestPath = importFolderPath + '/train/*test*'
    trainhex = "train.hex"
    testhex = "test.hex"
    parseTrainResult = h2i.import_parse(
        bucket="home-0xdiag-datasets",
        path=csvTrainPath,
        schema="local",
        hex_key=trainhex,
        timeoutSecs=2400,
        doSummary=False,
    )
    parseTestResult = h2i.import_parse(
        bucket="home-0xdiag-datasets",
        path=csvTestPath,
        schema="local",
        hex_key=testhex,
        timeoutSecs=2400,
        doSummary=False,
    )
    inspect_test = h2o.nodes[0].inspect(testhex, timeoutSecs=8000)

    response = "IsDepDelayed"
    ignored_cols = "DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed"

    params = {
        "destination_key": "GBMScore",
        "response": response,
        "ignored_cols_by_name": ignored_cols,
        "classification": 1,
        "validation": None,
        "ntrees": 100,
        "max_depth": 10,
        "learn_rate": 0.00005,
    }

    parseResult = {"destination_key": trainhex}
    kwargs = params.copy()
    gbm = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=4800, **kwargs)

    scoreStart = time.time()
    h2o.nodes[0].generate_predictions(model_key="GBMScore", data_key=trainhex)
    scoreElapsed = time.time() - scoreStart

    print "It took ", scoreElapsed, " seconds to score ", inspect_test["numRows"], " rows. Using a GBM with 100 10-deep trees."
    print "That's ", 1.0 * scoreElapsed / 100.0, " seconds per 10-deep tree."
def test_c9_GBM_airlines_hdfs(self):
    h2o.beta_features = True
    files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        for depth in [5, 15]:
            params = {
                "destination_key": "GBMKEY",
                "learn_rate": 0.2,
                "nbins": 1024,
                "ntrees": 10,
                "max_depth": depth,
                "min_rows": 10,
                "response": response,
                "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
            statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5)
            num_cpus = statMean["num_cpus"]
            my_cpu_pct = statMean["my_cpu_%"]
            sys_cpu_pct = statMean["sys_cpu_%"]
            system_load = statMean["system_load"]
            # shouldn't need this?
            h2j.pollWaitJobs(
                pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5
            )
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
            # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY')
            # print GBMView['gbm_model']['errs']

    h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GBMGrid_basic_benign(self):
    csvFilename = "benign.csv"
    print "\nStarting", csvFilename
    csvPathname = "logreg/" + csvFilename
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
    # columns start at 0
    # cols 0-13. 3 is output
    # no member id in this one
    # check the first in the models list. It should be the best
    colNames = [
        "STR", "OBS", "AGMT", "FNDX", "HIGD", "DEG", "CHK",
        "AGP1", "AGMN", "NLV", "LIV", "WT", "AGLP", "MST",
    ]
    modelKey = "GBMGrid_benign"
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        "destination_key": modelKey,
        "ignored_cols_by_name": "STR",
        "learn_rate": ".1,.2,.25",
        "ntrees": "3:5:1",
        "max_depth": "5,7",
        "min_rows": "1,2",
        "response": "FNDX",
        "classification": 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, **kwargs)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."
    h2o_gbm.showGBMGridResults(GBMResult, 0)
def test_GBMGrid_basic_prostate(self):
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = "logreg/" + csvFilename
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put")
    colNames = ["ID", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    modelKey = "GBMGrid_prostate"
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        "destination_key": modelKey,
        "ignored_cols_by_name": "ID",
        "learn_rate": 0.1,
        "ntrees": "2,4",
        "max_depth": 8,
        "min_rows": 1,
        "response": "CAPSULE",
        "classification": 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
    # no pattern waits for all
    h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMFirstResult["job_key"], destination_key=modelKey)
    print h2o.dump_json(gbmGridView)

    if 1 == 0:
        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView["gbm_model"]["errs"][-1]
        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView["gbm_model"]["cm"]
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
def test_GBMGrid_basic_prostate(self):
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    # columns start at 0
    csvPathname = 'logreg/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON']
    modelKey = 'GBMGrid_prostate'
    # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive
    params = {
        'destination_key': modelKey,
        'ignored_cols_by_name': 'ID',
        'learn_rate': .1,
        'ntrees': 'c(2,4)',
        'max_depth': 8,
        'min_rows': 1,
        'response': 'CAPSULE',
        'classification': 1 if DO_CLASSIFICATION else 0,
    }
    kwargs = params.copy()
    timeoutSecs = 1800
    start = time.time()
    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
    # no pattern waits for all
    h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds."

    gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMFirstResult['job_key'], destination_key=modelKey)
    print h2o.dump_json(gbmGridView)

    if 1==0:
        gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
        # errs from end of list? is that the last tree?
        errsLast = gbmTrainView['gbm_model']['errs'][-1]
        print "GBM 'errsLast'", errsLast
        if DO_CLASSIFICATION:
            cm = gbmTrainView['gbm_model']['cm']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)
        else:
            print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBM_ec2_airlines(self):
    h2o.beta_features = False
    files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        h2o.beta_features = False  # turn off beta_features
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        for depth in [5,15]:
            params = {
                'destination_key': "GBMKEY",
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': 10,
                'max_depth': depth,
                'min_rows': 10,
                'response': response,
                'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True
            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
def test_GBM_ec2_airlines(self):
    h2o.beta_features = False
    files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")]

    for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
        h2o.beta_features = False  # turn off beta_features
        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )
        print "parse result:", parseResult["destination_key"]

        # GBM (train)****************************************
        # passes 5, fails 15
        # for depth in [5,15,25,40]:
        for depth in [5, 5, 5, 5, 5]:
            params = {
                "destination_key": "GBMKEY",
                "learn_rate": 0.2,
                "nbins": 1024,
                "ntrees": 10,
                "max_depth": depth,
                "min_rows": 10,
                "response": response,
                "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed",
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True
            start = time.time()
            print "Start time is: ", time.time()
            # noPoll -> False when GBM finished
            GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs)
            h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=1800, pollTimeoutSecs=1800)
            print "Finished time is: ", time.time()
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename
def test_GBM_mnist_restart_many(self):
    importFolderPath = "mnist"
    csvFilename = "train.csv.gz"
    timeoutSecs = 1800
    trialStart = time.time()

    for trial in range(10):
        # PARSE train****************************************
        trainKey = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename,
            schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # GBM (train)****************************************
        params = {
            'destination_key': "GBMKEY",
            'learn_rate': .1,
            'ntrees': 10,
            'max_depth': 8,
            'min_rows': 1,
            'response': 784,  # this dataset has the response in the last col (0-9 to check)
            # 'ignored_cols_by_name': range(200,784)  # only use the first 200 for speed?
        }
        kwargs = params.copy()
        h2o.beta_features = True
        timeoutSecs = 1800
        # noPoll -> False when GBM finished
        GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        h2o.beta_features = False

        # if it fails, should happen within 8 secs
        time.sleep(8)
        h2j.cancelAllJobs()
        h2o.check_sandbox_for_errors()
        print "Trial %s: GBM start didn't have any errors after 8 seconds. cancelled. Will delete all keys now." % trial

        if DO_DELETE_KEYS_AND_CAUSE_PROBLEM:
            h2i.delete_keys_at_all_nodes()
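# The trial loop above is a start/cancel probe: dispatch GBM, give it a few
# seconds to fail fast, then cancel and scan for errors. A minimal sketch of
# that probe shape, assuming hypothetical start_job/cancel_all/check_errors
# callables (the real test uses runGBM(noPoll=True), h2j.cancelAllJobs, and
# h2o.check_sandbox_for_errors):
import time

def probe_start_then_cancel(start_job, cancel_all, check_errors, settleSecs=8):
    start_job()              # async dispatch, returns immediately
    time.sleep(settleSecs)   # if the start is going to fail, it fails quickly
    cancel_all()             # stop the job before it runs to completion
    check_errors()           # raise if anything bad showed up in the logs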
def test_GBM_mnist(self):
    importFolderPath = "mnist"
    csvFilename = "train.csv.gz"
    timeoutSecs = 1800
    trialStart = time.time()

    # PARSE train****************************************
    trainKey = csvFilename + "_" + ".hex"
    start = time.time()
    parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + csvFilename,
        schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs)
    elapsed = time.time() - start
    print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    print "parse result:", parseResult['destination_key']

    # GBM (train)****************************************
    params = {
        'destination_key': "GBMKEY",
        'learn_rate': .1,
        'ntrees': 10,
        'max_depth': 8,
        'min_rows': 1,
        'response': 784,  # this dataset has the response in the last col (0-9 to check)
        # 'ignored_cols_by_name': range(200,784)  # only use the first 200 for speed?
    }
    kwargs = params.copy()
    h2o.beta_features = True
    timeoutSecs = 1800
    # noPoll -> False when GBM finished
    start = time.time()
    GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
    # hack!
    if h2o.beta_features:
        h2o_jobs.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=5)
    elapsed = time.time() - start
    print "GBM training completed in", elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_GBMScore(self):
    h2o.beta_features = False
    importFolderPath = 'standard'
    csvTrainPath = importFolderPath + '/allyears2k.csv'
    csvTestPath = csvTrainPath
    # importFolderPath = 'newairlines'
    # csvTrainPath = importFolderPath + '/train/*train*'
    # csvTestPath = importFolderPath + '/train/*test*'
    trainhex = 'train.hex'
    testhex = 'test.hex'
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPath,
        schema='local', hex_key=trainhex, timeoutSecs=2400, doSummary=False)
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPath,
        schema='local', hex_key=testhex, timeoutSecs=2400, doSummary=False)
    inspect_test = h2o.nodes[0].inspect(testhex, timeoutSecs=8000)

    h2o.beta_features = True
    response = 'IsDepDelayed'
    ignored_cols = 'DepTime,ArrTime,FlightNum,TailNum,ActualElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed'

    params = {
        'destination_key': 'GBMScore',
        'response': response,
        'ignored_cols_by_name': ignored_cols,
        'classification': 1,
        'validation': None,
        'ntrees': 100,
        'max_depth': 10,
        'learn_rate': 0.00005,
    }

    parseResult = {'destination_key': trainhex}
    kwargs = params.copy()
    gbm = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=4800, **kwargs)

    scoreStart = time.time()
    h2o.nodes[0].generate_predictions(model_key='GBMScore', data_key=trainhex)
    scoreElapsed = time.time() - scoreStart

    print "It took ", scoreElapsed, " seconds to score ", inspect_test['num_rows'], " rows. Using a GBM with 100 10-deep trees."
    print "That's ", 1.0*scoreElapsed / 100.0, " seconds per 10-deep tree."
def test_rf_enums_mappings(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        # (n, 4, 'cG', 300),
        # (n, 5, 'cH', 300),
        # (n, 6, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
    ]

    # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
    SEED_FOR_TRAIN = 1234567890
    SEED_FOR_SCORE = 9876543210

    errorHistory = []
    enumHistory = []
    lastcolsTrainHistory = []
    lastcolsScoreHistory = []

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        enumList = create_enum_list(listSize=ENUMS)
        # reverse the list
        enumList.reverse()

        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        # use same enum List
        enumListForScore = enumList

        print "Creating random", csvPathname, "for rf model building"
        lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)
        lastcolsTrainHistory.append(lastcols)

        print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
        # same enum list/mapping, but different dataset?
        lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
        lastcolsScoreHistory.append(lastcols)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'enums'
        # limit depth and number of trees to accentuate the issue with categorical split decisions
        # use mtries so both look at all cols at every split? doesn't matter for speedrf
        # does speedrf try one more time? with 3 cols, mtries=2, so another try might
        # get a look at the missing col
        # does matter for drf2. does it "just stop" trying? mtries always looking at
        # all columns or 1 col might be interesting
        if SPEEDRF:
            kwargs = {
                'sample_rate': 0.999,
                'destination_key': modelKey,
                'response': y,
                'ntrees': 1,
                'max_depth': 100,
                # 'oobee': 1,
                'validation': hex_key,
                # 'validation': scoreDataKey,
                'seed': 123456789,
                'mtries': COLS,
            }
        elif GBM:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'validation': scoreDataKey,
                'seed': 123456789,
                # 'learn_rate': .1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'classification': 1,
            }
        else:
            kwargs = {
                'sample_rate': 0.999,
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'validation': hex_key,
                # 'validation': scoreDataKey,
                'seed': 123456789,
                'nbins': 1024,
                'mtries': COLS,
            }

        for r in range(2):
            start = time.time()
            if GBM:
                gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "gbm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(gbmResult)
                (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)
            elif SPEEDRF:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "speedrf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
            else:
                rfResult = h2o_cmd.runRF(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey,
                vactual=y, vpredict=1, doAUC=not MULTINOMIAL)  # , expectedAuc=0.5

            errorHistory.append(classification_error)
            enumHistory.append(enumList)

        print "error from all runs on this dataset (with different enum mappings)"
        print errorHistory
        for e in enumHistory:
            print e

        print "last row from all train datasets, as integer"
        for l in lastcolsTrainHistory:
            print l
        print "last row from all score datasets, as integer"
        for l in lastcolsScoreHistory:
            print l
def test_GBM_manyfiles_train_test(self):
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    if localhost:
        files = [
            # None forces num_cols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800,
                None, 'file_1.dat.gz', 'file_1_test.hex')
        ]
    else:
        files = [
            # None forces num_cols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800,
                None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
        ]

    # if I got to hdfs, it's here
    # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz
    # h2b.browseTheCloud()

    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        h2o.beta_features = False  # turn off beta_features
        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        csvPathname = importFolderPath + "/" + trainFilename
        parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n',
            hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTrainResult['destination_key'] for h2o"
            parseTrainResult['destination_key'] = trainKey

        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']
        ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

        # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
        # h2o.beta_features = True
        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']

        # Make col 378 something we can do binomial regression on!
        execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

        # Parse (test)****************************************
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename,
            schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTestResult['destination_key'] for h2o"
            parseTestResult['destination_key'] = testKey

        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # Make col 378 something we can do binomial regression on!
        print "Slow! exec is converting all imported keys?, not just what was parsed"
        execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

        # Note ..no inspect of test data here..so translate happens later?

        # GBM (train iterate)****************************************
        # if not response:
        #     response = num_cols - 1
        response = 378
        print "Using the same response %s for train and test (which should have an output value too)" % response

        ntrees = 10
        for max_depth in [5, 10, 20, 40]:
            params = {
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'response': response,
                # 'ignored_cols':
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True

            # GBM train****************************************
            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True,
                timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cm']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # GBM test****************************************
            if doPredict:
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'],
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                print "This is crazy!"
                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict',  # choices are 0 and 'predict'
                )
                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            xList.append(max_depth)
            eList.append(pctWrong)
            fList.append(trainElapsed)

        h2o.beta_features = False
        if doPredict:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_multijob(self):
    h2o.beta_features = True
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    if localhost:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            # problems with categoricals not in the train data set? (warnings in h2o stdout)
            ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
            # just use matching
            ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
        ]
    else:
        files = [
            # None forces numCols to be used. assumes you set it from Inspect
            ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
        ]

    # if I got to hdfs, it's here
    # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz
    # h2b.browseTheCloud()

    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        csvPathname = importFolderPath + "/" + trainFilename
        parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']
        ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # Make col 378 something we can do binomial regression on!
        # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
        # inc by 1 for R col
        # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate
        # only a problem if they share the dataset, do classification with integers.
        # change to factor here, to avoid the problem
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
        if not DO_FAIL:
            execExpr += "; factor(%s[, 378+1]);" % (trainKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

        # Parse (test)****************************************
        csvPathname = importFolderPath + "/" + testFilename
        parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # Make col 378 something we can do binomial regression on!
        # plus 1 for R indexing
        execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
        if not DO_FAIL:
            execExpr += "; factor(%s[, 378+1]);" % (testKey)
        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

        # Note ..no inspect of test data here..so translate happens later?

        # GBM (train iterate)****************************************
        # if not response:
        #     response = numCols - 1
        response = 378

        # randomly ignore a bunch of cols, just to make it go faster
        x = range(numCols)
        del x[response]
        # add 1 for start-with-1
        ignored_cols_by_name = ",".join(map(lambda x: "C" + str(x + 1), random.sample(x, 300)))

        print "Using the same response %s for train and test (which should have an output value too)" % ('C' + str(response + 1))

        ntrees = 10
        trial = 0
        # ignore 300 random cols (not the response)
        print "Kicking off multiple GBM jobs at once"
        # GBM train****************************************
        if DO_FAIL:
            cases = [5, 10, 20, 40]
        else:
            cases = [5, 10, 20]

        for max_depth in cases:
            trial += 1
            params = {
                'response': "C" + str(response + 1),
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'validation': parseTestResult['destination_key'],
                'ignored_cols_by_name': ignored_cols_by_name,
                'grid_parallelism': 1,
                'classification': 1 if DO_CLASSIFICATION else 0,
            }
            ### print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            trainStart = time.time()
            # can take 4 times as long with 4 jobs?
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True,
                timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs)
            trainElapsed = time.time() - trainStart
            print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename

        statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5)
        num_cpus = statMean['num_cpus']
        my_cpu_pct = statMean['my_cpu_%']
        sys_cpu_pct = statMean['sys_cpu_%']
        system_load = statMean['system_load']
        h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
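# The ignored_cols_by_name string above converts 0-based column indices into
# the 1-based "C<n>" names the fvec API expects, while protecting the
# response column. The same construction as a standalone pure-python demo;
# ignored_cols_string is a hypothetical helper, shown only for illustration.
import random

def ignored_cols_string(numCols, response, nIgnore):
    candidates = [i for i in range(numCols) if i != response]  # never drop the response
    picked = random.sample(candidates, nIgnore)
    return ",".join("C" + str(i + 1) for i in picked)  # +1 for start-with-1 names

# e.g. ignored_cols_string(numCols=542, response=378, nIgnore=300)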
def test_GBM_many_cols(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    if localhost:
        tryList = [
            (100000, 400, 'cA', 300),
        ]
    else:
        tryList = [
            # (10000, 10, 'cB', 300),
            # (10000, 50, 'cC', 300),
            (100000, 100, 'cD', 300),
            (100000, 200, 'cE', 300),
            (100000, 500, 'cG', 300),
            (100000, 1000, 'cI', 300),
        ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # PARSE train****************************************
        h2o.beta_features = False  # turn off beta_features
        start = time.time()
        xList = []
        eList = []
        fList = []
        h2o.beta_features = False
        modelKey = 'GBMModelKey'

        # Parse (train)****************************************
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTrainResult['destination_key'] for h2o"
            parseTrainResult['destination_key'] = hex_key

        elapsed = time.time() - start
        print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        # Logging to a benchmark file
        algo = "Parse"
        # l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
        #     len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
        l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, elapsed)
        print l
        h2o.cloudPerfH2O.message(l)

        # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
        # h2o.beta_features = True
        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

        # GBM (train iterate)****************************************
        h2o.beta_features = True
        # was failing with 100 trees
        # ntrees = 100
        # for max_depth in [5,10,20,40]:
        ntrees = 10
        for max_depth in [5]:
            params = {
                'learn_rate': .2,
                'nbins': 10,  # 1024 fail
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                'response': num_cols-1,
                'ignored_cols_by_name': None,
            }
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True

            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features,
                timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

            # Logging to a benchmark file
            algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
            l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, trainElapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cm']
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            xList.append(max_depth)
            eList.append(pctWrongTrain)
            fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_regression_rand2(self):
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    files = [
        # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
        ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C55',
            'covtype.shuffled.10pct.data', 'covtype.test.hex')
    ]

    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename,
            schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", trainKey

        # Parse (test)****************************************
        parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename,
            schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", testKey

        paramsDict = define_gbm_params()
        for trial in range(3):
            # use this to set any defaults you want if the pick doesn't set
            print "Regression!"
            params = {
                'response': 'C55',
                # 'ignored_cols_by_name': 'C5,C6,C7,C8,C9',
                'ntrees': 2,
                'classification': 0,
                'validation': testKey,
            }
            h2o_gbm.pickRandGbmParams(paramsDict, params)
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            # GBM train****************************************
            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            print "gbmTrainView:", h2o.dump_json(gbmTrainView)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast
            # for regression, the cms are all null, so don't print

            # GBM test****************************************
            predictKey = 'Predict.hex'
            start = time.time()
            gbmTestResult = h2o_cmd.runPredict(data_key=testKey, model_key=modelKey,
                destination_key=predictKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename
            print "FIX! where do we get the summary info on the test data after predict?"
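# h2o_gbm.pickRandGbmParams overlays random picks from paramsDict onto the
# caller's params, so each trial trains with a different configuration while
# the explicit keys above act as defaults. A minimal sketch of that idea,
# assuming paramsDict maps parameter names to lists of candidate values
# (pick_rand_params is a hypothetical helper, not the real implementation):
import random

def pick_rand_params(paramsDict, params):
    for name, choices in paramsDict.items():
        pick = random.choice(choices)
        if pick is not None:  # treat None as "leave the caller's default alone"
            params[name] = pick
    return params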
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if h2o.localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I go to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 something we can do binomial regression on! print "Slow! exec seems to convert all imported keys, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note: no inspect of test data here, so the translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have an output value too)" % response ntrees = 10 for max_depth in [5,10,20,40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
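# --- Editor's sketch (standalone) ---
# The arithmetic behind h2o_gbm.pp_cm_summary() above: percent wrong over a
# confusion matrix is (total - diagonal) / total. The tests warn that the last
# line of a returned cm may be NAs rather than counts, so a real caller trims
# that first. The cm below is made up.
def pct_wrong(cm):
    total = sum(sum(row) for row in cm)
    right = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - right) / total

cm = [[50, 2], [5, 43]]  # rows = actual class, cols = predicted class
print "%.2f pct wrong" % pct_wrong(cm)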
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs = 1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** modelKey = "GBM_model" params = { 'classification': 1, # faster? 'destination_key': modelKey, 'learn_rate': .1, 'ntrees': 3, 'max_depth': 8, 'min_rows': 1, 'response': 0, # the response (digit labels 0-9) is col 0 in this file # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 # noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cms = gbmTrainView['gbm_model']['cms'] cm = cms[-1]['_arr'] # use the last one print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr'] print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
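# --- Editor's sketch (standalone) ---
# The noPoll=True pattern above fires the GBM job and returns immediately;
# the test then polls (pollStatsWhileBusy / pollWaitJobs) until the job is
# done before calling runGBMView. A toy, pure-Python model of that control
# flow, with a thread standing in for the remote job:
import threading, time

def start_job(seconds):
    done = threading.Event()
    def work():
        time.sleep(seconds)
        done.set()
    threading.Thread(target=work).start()
    return done  # like the job key handed back by runGBM(noPoll=True)

job = start_job(0.5)
while not job.is_set():  # like pollWaitJobs: spin until the job completes
    time.sleep(0.1)
    print "still busy..."
print "job finished; now safe to view the model"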
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 
1 : 0))' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(num_cols) del x[response] ignored_cols_by_name = ",".join(map(str,random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': ignored_cols_by_name, } if FORCE_FAIL_CASE: params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} ### print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_cancel_model_reuse(self): h2o.beta_features = True importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), # ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?" (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult['destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: # Only integer or enum/factor columns can be classified if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1) kwargs = { 'str': execExpr } resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) # x = range(542) # remove the output too! (378) ignoreIndex = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response] # have to add 1 for col start with 1, now. plus the C xIgnore = ",".join(["C" + str(i+1) for i in ignoreIndex]) params = { 'destination_key': None, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response+1), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 for i in range(5): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # FIX! apparently we can't reuse a model key after a cancel kwargs['destination_key'] = 'GBMBad' + str(j) # rjson error in poll_url: Job was cancelled by user! GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult['job_key']) h2o.check_sandbox_for_errors() # have to pass the job id # for j in jobids: # h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.cancelAllJobs() # PUB-361. going to wait after cancel before reusing keys time.sleep(3) # am I getting a subsequent parse job cancelled? h2o_jobs.showAllJobs() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
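# --- Editor's sketch (standalone) ---
# The cancel bookkeeping above: each noPoll submission hands back a job key,
# the test collects them so it can cancel by key (or via cancelAllJobs), and
# then sleeps before reusing destination keys (PUB-361). The job objects here
# are toy stand-ins for GBMFirstResult['job_key'].
jobids = []
for j in range(5):
    destination_key = 'GBMBad' + str(j)  # fresh key per job; reuse after a cancel is unsafe
    job_key = 'job_' + destination_key   # stand-in for the key runGBM would return
    jobids.append(job_key)
for job_key in jobids:
    print "would cancel", job_key        # stands in for h2o.nodes[0].jobs_cancel(key=job_key)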
def test_GBM_poker_1m(self): h2o.beta_features = True for trial in range(2): # PARSE train**************************************** h2o.beta_features = False # turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' timeoutSecs = 900 # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = 'poker/poker-hand-testing.data' hex_key = 'poker-hand-testing.data.hex' parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = hex_key elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 2 for max_depth in [5,10,20]: params = { 'learn_rate': .1, 'nbins': 10, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': numCols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
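# --- Editor's sketch (standalone) ---
# The benchmark line pushed to h2o.cloudPerfH2O above is plain str.format()
# over the cluster shape, algo tag, dataset, and elapsed seconds. Rendering it
# with made-up values:
nJVMs, heapGB, elapsed = 4, 12, 83.17
algo = "GBM " + " ntrees=" + str(2) + " max_depth=" + str(20)
l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
    nJVMs, heapGB, algo, 'poker/poker-hand-testing.data', elapsed)
print l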
def test_GBM_params_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): # translate it (only really need to do once; move out of loop?) h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # use this to set any defaults you want if the pick doesn't set them params = { 'response': 54, 'ignored_cols_by_name': 'C1,C2,C3,C4,C5', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errs from end of list? 
is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_c10_rel_gbm(self): h2o.beta_features = True print "not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Test*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' testFilename = 'classification1Test.txt' testPathname = importFolderPath + "/" + testFilename start = time.time() parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' trainFilename = 'classification1Train.txt' trainPathname = importFolderPath + "/" + trainFilename start = time.time() parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500) print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, trainPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # GBM Train*********************************************************** x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] # response = 0 # doesn't work if index is used? response = 'outcome' # x = range(inspect['num_cols']) # del x[response] ntrees = 10 # fails with 40 params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': 20, 'min_rows': 2, 'response': response, 'cols': x, # 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() modelKey = 'GBMModelKey' timeoutSecs = 900 trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # get the last cm cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. 
On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GBM_with_cancels(self): print "do import/parse with VA" h2o.beta_features = False importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse( bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False ) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {'destination_key': 'c.hex'} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult[ 'destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath == 'manyfiles-nflx-gz': if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response + 1, response + 1) kwargs = {'str': execExpr} resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now h2o.beta_features = True s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) # x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response ]: # have to add 1 for col start with 1, now. plus the C xIgnore.append("C" + str(i + 1)) else: # leave one col ignored, just to see? xIgnore = 'C1' modelKey = "GBMGood" params = { 'destination_key': modelKey, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response + 1), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(15): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # FIX! 
apparently we can't reuse a model key after a cancel kwargs['destination_key'] = 'GBMBad' + str(i) + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult['job_key']) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs']) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def test_GBM_parseTrain(self): # folderpath, filename, keyname, timeout, response bucket = 'home-0xdiag-datasets' files = [('mnist', 'mnist_training.csv.gz', 'mnistsmalltrain.hex', 1800, 0)] grid = [[1, 10, 100, 1000], [0.0, 0.01, 0.001, 0.0001, 1], [1, 2], [1, 10, 100]] grid = list(itertools.product(*grid)) grid = random.sample(grid, 10) # don't do all 120, but get a random sample for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + csvFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] csv_header = ('nJVMs', 'java_heap_GB', 'dataset', 'ntrees', 'max_depth', 'learn_rate', 'min_rows', 'trainTime') for ntree, learn_rate, max_depth, min_rows in grid: if not os.path.exists('gbm_grid.csv'): output = open('gbm_grid.csv', 'w') output.write(','.join(csv_header) + '\n') else: output = open('gbm_grid.csv', 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') java_heap_GB = h2o.nodes[0].java_heap_GB params = { 'destination_key': 'GBMKEY', 'learn_rate': learn_rate, 'ntrees': ntree, 'max_depth': max_depth, 'min_rows': min_rows, 'response': response } print "Using these parameters for GBM: ", params kwargs = params.copy() # noPoll -> False when GBM finished start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs) h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=3600, pollTimeoutSecs=3600) # print "GBM training completed in", GBMResult['python_elapsed'], "seconds.", \ # "%f pct. of timeout" % (GBMResult['python_%timeout']) # print GBMResult GBMView = h2o_cmd.runGBMView(model_key='GBMKEY') print GBMView['gbm_model']['errs'] elapsed = time.time() - start row = { 'nJVMs': len(h2o.nodes), 'java_heap_GB': java_heap_GB, 'dataset': 'mnist_training.csv.gz', 'learn_rate': learn_rate, 'ntrees': ntree, 'max_depth': max_depth, 'min_rows': min_rows, 'trainTime': elapsed } print row csvWrt.writerow(row) output.close()
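# --- Editor's sketch (standalone) ---
# The grid construction above: itertools.product expands the per-parameter
# choice lists into the full cross product (4 * 5 * 2 * 3 = 120 combos) and
# random.sample keeps the run bounded at 10 of them.
import itertools, random

choices = [[1, 10, 100, 1000],             # ntrees
           [0.0, 0.01, 0.001, 0.0001, 1],  # learn_rate
           [1, 2],                         # max_depth
           [1, 10, 100]]                   # min_rows
combos = list(itertools.product(*choices))
print len(combos), "combos total"
for ntree, learn_rate, max_depth, min_rows in random.sample(combos, 10):
    print ntree, learn_rate, max_depth, min_rows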
def test_GBMGrid_basic_prostate(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = [ 'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON' ] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': .1, 'ntrees': '4,100', 'max_depth': 8, 'min_rows': 1, 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs) if not DO_POLL: print "\nfirst GBMResult:", h2o.dump_json(GBMResult) statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) num_cpus = statMean['num_cpus'] my_cpu_pct = statMean['my_cpu_%'] sys_cpu_pct = statMean['sys_cpu_%'] system_load = statMean['system_load'] # shouldn't need this? h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." # FIX! after gbm grid, have to get the model keys from the json? gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey) print h2o.dump_json(gbmGridView) if 1 == 0: gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBMGrid_basic_many(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = [ 'ID', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON' ] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': '.1,.2', 'ntrees': '8,10', 'max_depth': '8,9', 'min_rows': '1,2', 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 1, } kwargs = params.copy() timeoutSecs = 1800 jobs = [] # kick off 5 of these GBM grid jobs, with different tree choices start = time.time() totalGBMGridJobs = 0 # for more in range(8): # fast # for more in range(9): for i in range(5): kwargs = params.copy() kwargs['min_rows'] = '1,2,3' if DO_FROM_TO_STEP: kwargs['max_depth'] = '5:10:1' else: kwargs['max_depth'] = '5,6,10' GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # print "GBMResult:", h2o.dump_json(GBMResult) job_key = GBMResult['job_key'] model_key = GBMResult['destination_key'] jobs.append((job_key, model_key)) totalGBMGridJobs += 1 h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start for job_key, model_key in jobs: GBMResult = h2o.nodes[0].gbm_grid_view(job_key=job_key, destination_key=model_key) h2o_gbm.showGBMGridResults(GBMResult, 15) print "All GBM jobs completed in", elapsed, "seconds." print "totalGBMGridJobs:", totalGBMGridJobs
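# --- Editor's sketch (standalone) ---
# Grid parameters above come in two spellings: a comma list ('5,6,10') or a
# from:to:step range ('5:10:1', see DO_FROM_TO_STEP). The real expansion
# happens inside H2O; this is a hedged, integer-only guess at the semantics
# (whether the upper bound is inclusive is an assumption):
def expand_grid_spec(spec):
    if ':' in spec:
        lo, hi, step = map(int, spec.split(':'))
        return range(lo, hi + 1, step)  # assume inclusive upper bound
    return [int(tok) for tok in spec.split(',')]

print expand_grid_spec('5:10:1')  # [5, 6, 7, 8, 9, 10]
print expand_grid_spec('5,6,10')  # [5, 6, 10]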
# 'ignored_cols_by_name': None, 'importance': importance, 'group_split': group_split, } print "Using these parameters for GBM: ", params kwargs = params.copy() # translate it (only really need to do once; move out of loop?) h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test****************************************
def test_c9_GBM_airlines_hdfs(self): h2o.beta_features = True files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** for depth in [5, 15]: params = { 'destination_key': "GBMKEY", 'learn_rate': .2, 'nbins': 1024, 'ntrees': 10, 'max_depth': depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } print "Using these parameters for GBM: ", params kwargs = params.copy() timeoutSecs = 1800 start = time.time() print "Start time is: ", time.time() # noPoll -> False when GBM finished GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'] my_cpu_pct = statMean['my_cpu_%'] sys_cpu_pct = statMean['sys_cpu_%'] system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5) print "Finished time is: ", time.time() elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds. On dataset: ", csvFilename # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY') # print GBMView['gbm_model']['errs'] h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GBM_sphere15_180GB(self): csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' totalBytes = 183538602156 importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([ 0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0 ], 248846122, 1308149283316.2988), ([ 0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0 ], 276924291, 1800760152555.98), ([ 0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394 ], 235089554, 375419158808.3253), ([ 0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0 ], 166180630, 525423632323.6474), ([ 0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0 ], 167234179, 1845362026223.1094), ([ 0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985 ], 195420925, 197941282992.43475), ([ 0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0 ], 214401768, 11868360232.658035), ([ 0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907 ], 258853406, 598863991074.3276), ([ 0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0 ], 190979054, 1505088759456.314), ([ 0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0 ], 87794427, 1124697008162.3955), ([ 0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028 ], 78226988, 1151439441529.0215), ([ 0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574 ], 167273589, 693036940951.0249), ([ 0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539 ], 148426180, 35942838893.32379), ([ 0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707 ], 157533313, 88431531357.62982), ([ 0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0 ], 118361306, 1111537045743.7646), ] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu', 'disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. # PARSE **************************************** print "Parse starting: " + csvFilename # hex_key = csvFilename + "_" + str(trial) + ".hex" hex_key = "C" + str(trial) start = time.time() timeoutSecs = 2 * 3600 kwargs = {} parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes / 1e6) / elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n" + l h2o.cloudPerfH2O.message(l) # GBM **************************************** if not DO_GBM: continue # make col 2 a binomial (negative numbers in src col = 2 execExpr = "%s[,%s] = (%s[,%s]>-7 ? 
1 : 0))" % (hex_key, col, hex_key, col) resultExec = h2o_cmd.runExec(str=execExpr) params = { 'destination_key': "GBMKEY", 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': col # should be binomial from above } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # wait for it to show up in jobs? time.sleep(2) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % ( GBMResult['python_%timeout']) print "\nGBMResult:", GBMResult # print "\nGBMResult:", h2o.dump_json(GBMResult) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_at_all_nodes()
def test_GBM_regression_rand2(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 'C54', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", trainKey # Parse (test)**************************************** start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", testKey paramsDict = define_gbm_params() for trial in range(3): # use this to set any defaults you want if the pick doesn't set them print "Regression!" params = {'response': 'C54', 'ignored_cols_by_name': 'C5,C6,C7,C8,C9', 'ntrees': 2, 'classification': 0} h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # for regression, the cms are all null, so don't print # GBM test**************************************** predictKey = 'Predict.hex' start = time.time() gbmTestResult = h2o_cmd.runPredict(data_key=testKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "FIX! where do we get the summary info on the test data after predict?"
def test_GBM_poker_1m(self): for trial in range(2): # PARSE train**************************************** h2o.beta_features = False # turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' timeoutSecs = 900 # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = 'poker/poker-hand-testing.data' hex_key = 'poker-hand-testing.data.hex' parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = hex_key elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 2 for max_depth in [5,10,20]: params = { 'learn_rate': .1, 'nbins': 10, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': num_cols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_sphere15_180GB(self): csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv' totalBytes = 183538602156 if FROM_HDFS: importFolderPath = "datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename else: importFolderPath = "/home3/0xdiag/datasets/kmeans_big" csvPathname = importFolderPath + '/' + csvFilename # FIX! put right values in # will there be different expected for random vs the other inits? expected = [ ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) , ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) , ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) , ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) , ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) , ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) , ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) , ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) , ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) , ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) , ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) , ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) , ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) , ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) , ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) , ] benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] benchmarkLogging = [] for trial in range(6): # IMPORT********************************************** # since H2O deletes the source key, re-import every iteration. 
# PARSE **************************************** print "Parse starting: " + csvFilename # hex_key = csvFilename + "_" + str(trial) + ".hex" hex_key = "C" + str(trial) start = time.time() timeoutSecs = 2 * 3600 kwargs = {} if FROM_HDFS: parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) else: parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, benchmarkLogging=benchmarkLogging, **kwargs) elapsed = time.time() - start fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed) print "\n" + l h2o.cloudPerfH2O.message(l) # GBM **************************************** if not DO_GBM: continue # make col 2 a binomial (negative numbers in src) col = 2 execExpr = "%s[,%s] = (%s[,%s]>-7 ? 1 : 0)" % (hex_key, col, hex_key, col) resultExec = h2o_cmd.runExec(str=execExpr) params = { 'destination_key': "GBMKEY", 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': col # should be binomial from above } kwargs = params.copy() h2o.beta_features = True timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) # wait for it to show up in jobs? time.sleep(2) # no pattern waits for all h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", "%f pct. of timeout" % (GBMResult['python_%timeout']) print "\nGBMResult:", GBMResult # print "\nGBMResult:", h2o.dump_json(GBMResult) h2o.beta_features = False h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_at_all_nodes()
def test_GBM_many_cols_enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), # (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 10 for max_depth in [5,10,20,40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(numCols-1), 'ignored_cols_by_name': None, } # both response variants should work? # if random.randint(0,1): # params['response'] = numCols-1, print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1 == 1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
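# xList/eList/fList above are parallel series (max_depth, pctWrong, trainElapsed)
# that h2o_gbm.plotLists turns into a plot. A minimal text-only stand-in, assuming
# nothing beyond the three lists (print_lists is a hypothetical name):
def print_lists(xList, xLabel, eList, eLabel, fList, fLabel):
    print "%-12s %-12s %-12s" % (xLabel, eLabel, fLabel)
    for x, e, f in zip(xList, eList, fList):
        print "%-12s %-12s %-12s" % (x, e, f)

# print_lists([5, 10], 'max_depth', [12.5, 9.8], 'pctWrong', [3.1, 7.4], 'trainElapsed')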
def test_GBM_with_cancels(self): print "Sets h2o.beta_features like -bf at command line" print "this will redirect import and parse to the 2 variants" h2o.beta_features = True importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), # ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {'destination_key': 'c.hex'} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult['destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath=='manyfiles-nflx-gz': if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1) kwargs = { 'str': execExpr } resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]: if i not in x: print "x:", x print 'missing?', i x.remove(i) xIgnore.append(i) x = ",".join(map(str,x)) def colIt(x): return "C" + str(x) xIgnore = ",".join(map(colIt, xIgnore)) else: # leave one col ignored, just to see? 
xIgnore = 0 modelKey = "GBMGood" params = { 'destination_key': modelKey, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(20): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): kwargs['destination_key'] = 'GBMBad' + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) jobids.append(GBMFirstResult['job_key']) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs']) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
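# The cancel test above fires several noPoll GBM jobs and cancels them by job key
# while the 'GBMGood' job keeps running. A sketch of just that launch-then-cancel
# pattern, assuming the same h2o/h2o_cmd helpers this suite imports
# (launch_and_cancel is a hypothetical name):
def launch_and_cancel(parseResult, params, nBad=5):
    kwargs = params.copy()
    jobids = []
    for j in range(nBad):
        kwargs['destination_key'] = 'GBMBad' + str(j)
        result = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        jobids.append(result['job_key'])
    for j in jobids:
        h2o.nodes[0].jobs_cancel(key=j)  # cancel by job key; other jobs keep running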
def doGBM(f, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row): debug = False bench = "bench" if debug: print "Doing GBM DEBUG" bench = "bench/debug" #date = '-'.join([str(x) for x in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' gbmbenchcsv = 'benchmarks/'+build+'/'+pre+'gbmbench.csv' if not os.path.exists(gbmbenchcsv): output = open(gbmbenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(gbmbenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() h2o.beta_features = False #ensure this is false! if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 16000, retryDelaySecs = 5, pollTimeoutSecs = 16000, noPoll = True, doSummary = False ) h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=16000, retryDelaySecs=5) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." h2o.beta_features = False #make sure false for the inspect as well! 
inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=16000) inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=16000) h2o.beta_features = True #ok, can be true again nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts) row.update( {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nTrainRows' : inspect_train['num_rows'], 'nTestRows' : inspect_test['num_rows'], 'nCols' : inspect_train['num_cols'], 'trainParseWallTime' : parseWallTime, 'nTrees' : ntrees, 'minRows' : minrows, 'maxDepth' : depth, 'learnRate' : learnRate, 'classification' : classification, }) params = {'destination_key' : 'GBM('+f+')', 'response' : response, 'ignored_cols_by_name' : ignored_cols, 'classification' : classification, 'validation' : testFilehex, 'ntrees' : ntrees, 'max_depth' : depth, 'min_rows' : minrows, 'nbins' : nbins, 'learn_rate' : learnRate, } parseResult = {'destination_key' : hex_key} kwargs = params.copy() gbmStart = time.time() #TODO(spencer): Uses jobs to poll for gbm completion gbm = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=120, retryDelaySecs=5) gbmTime = time.time() - gbmStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) row.update( {'gbmBuildTime' : gbmTime, }) gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')') if classification: cm = gbmTrainView['gbm_model']['cm'] err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]) else: err = gbmTrainView['gbm_model']['errs'][-1] row.update({'Error' : err}) csvWrt.writerow(row) finally: output.close()
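# The binomial error above is the off-diagonal count over the total of a 2x2
# confusion matrix. A pure-Python generalization to an NxN cm, no h2o dependency
# (pct_wrong is a hypothetical helper, mirroring what pp_cm_summary reports):
def pct_wrong(cm):
    total = sum(sum(row) for row in cm)
    right = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - right) / total

# pct_wrong([[50, 3], [7, 40]]) -> 10.0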
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row): debug = False # was referenced below without being defined; the earlier doGBM variant defines it bench = "bench" if debug: print "Doing GBM DEBUG" bench = "bench/debug" date = '-'.join([str(x) for x in list(time.localtime())][0:3]) for f in fs['train']: overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv' if not os.path.exists(gbmbenchcsv): output = open(gbmbenchcsv, 'w') output.write(','.join(csv_header) + '\n') else: output = open(gbmbenchcsv, 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in [ 'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x', 'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x' ]): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, header=1, header_from_file=headerKey, separator=44, timeoutSecs=7200, retryDelaySecs=5, pollTimeoutSecs=7200) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime, " seconds." inspect_train = h2o.nodes[0].inspect( parseResult['destination_key']) inspect_test = h2o.nodes[0].inspect(testFilehex) nMachines = 1 if len(h2o_hosts.hosts) == 0 else len( h2o_hosts.hosts) row.update({ 'h2o_build': build, 'nMachines': nMachines, 'nJVMs': len(h2o.nodes), 'Xmx/JVM': java_heap_GB, 'dataset': f, 'nTrainRows': inspect_train['numRows'], 'nTestRows': inspect_test['numRows'], 'nCols': inspect_train['numCols'], 'trainParseWallTime': parseWallTime, 'classification': classification, }) params = { 'destination_key': 'GBM(' + f + ')', 'response': response, 'ignored_cols_by_name': ignored_cols, 'classification': classification, 'validation': testFilehex, 'ntrees': ntrees, 'max_depth': depth, 'min_rows': minrows, 'nbins': nbins, 'learn_rate': learnRate, } kwargs = params.copy() gbmStart = time.time() #TODO(spencer): Uses jobs to poll for gbm completion h2o.beta_features = True gbm = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=4800, **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5) h2o.beta_features = False gbmTime = time.time() - gbmStart row.update({ 'gbmBuildTime': gbmTime, }) #TODO(spencer): Add in gbm scoring #gbmScoreStart = time.time() #gbmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key']) #scoreTime = time.time() - gbmScoreStart csvWrt.writerow(row) finally: output.close()
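# Both doGBM variants share the append-or-create benchmark CSV idiom. A
# self-contained sketch of that idiom with a made-up header (csv_header and
# write_bench_row here are illustrative, not the suite's own definitions):
import csv, os

csv_header = ['dataset', 'nTrees', 'gbmBuildTime', 'Error']

def write_bench_row(path, row):
    exists = os.path.exists(path)
    output = open(path, 'a' if exists else 'w')
    try:
        if not exists:
            output.write(','.join(csv_header) + '\n')
        # restval fills missing keys; extrasaction='ignore' drops unknown ones
        csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                                dialect='excel', extrasaction='ignore', delimiter=',')
        csvWrt.writerow(row)
    finally:
        output.close()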
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have an output value too)" % ('C' + str(response+1)) ntrees = 10 # ignore 300 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response+1), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response+1), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
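# The ignored-cols trick above samples 0-based column indices (minus the response)
# and renders them as H2O's 1-based 'C<n>' names. Extracted as a standalone sketch
# (sample_ignored_cols is a hypothetical name):
import random

def sample_ignored_cols(numCols, response, nIgnore):
    x = range(numCols)       # 0-based indices (Python 2 list)
    del x[response]          # never ignore the response column
    return ",".join(['C' + str(i + 1) for i in random.sample(x, nIgnore)])

# sample_ignored_cols(542, 378, 300) -> 'C12,C301,...' (300 names)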
def test_GBM_covtype_train_test(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) ntrees = 2 # fails with 40 for max_depth in [40, 5]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
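# The fvec (beta_features) path above repeats a hack: request with noPoll=True,
# block on the jobs list, then patch destination_key back in because Parse2
# returns no usable response. The sequence as a sketch, assuming the same
# h2i/h2j helpers (parse_fvec is a hypothetical name):
def parse_fvec(bucket, path, hex_key, timeoutSecs):
    parseResult = h2i.import_parse(bucket=bucket, path=path, schema='local',
        hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=True, doSummary=False)
    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
    parseResult['destination_key'] = hex_key  # hack: fill in what Parse2 omits
    return parseResult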
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if h2o.localhost: tryList = [ (10000, 100, 'cA', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, 'cD', 300), (10000, 200, 'cE', 300), (10000, 300, 'cF', 300), (10000, 400, 'cG', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) # hack elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parseTrainResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 5 prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] # for max_depth in [5,10,20,40]: for max_depth in [5, 10, 20]: # PARSE a new header**************************************** print "Creating new header", hdrPathname prefix = prefixList.pop(0) write_syn_header(hdrPathname, rowCount, colCount, prefix) # upload and parse the header to a hex hdr_hex_key = prefix + "_hdr.hex" parseHdrResult = h2i.import_parse(bucket=None, path=hdrPathname, schema='put', header=1, # REQUIRED! otherwise will interpret as enums hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False) # Set Column Names (before autoframe is created) h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key) # GBM print "response col name is changing each iteration: parsing a new header" params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': prefix + "_response", 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # works if you delete the autoframe ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe') # just plot the last one if DO_PLOT: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # 0-based col index; the 1-based H2O name is 'C379' # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have an output value too)" % ('C' + str(response+1)) ntrees = 10 # ignore 300 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response+1), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response+1), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
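# Train and test must get the same '>15' cut on the response column before
# predict_confusion_matrix compares them, or the CM would mix encodings. The
# shared step as a sketch (binarize_col is a hypothetical name; the exec
# syntax and 1-based indexing are the suite's own):
def binarize_col(hex_key, col1based, cut, timeoutSecs=60):
    execExpr = '%s[,%d]=%s[,%d]>%d' % (hex_key, col1based, hex_key, col1based, cut)
    return h2o_cmd.runExec(str=execExpr, timeoutSecs=timeoutSecs)

# binarize_col('train.hex', 379, 15); binarize_col('test.hex', 379, 15)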
def test_GBMGrid_basic_benign(self): h2o.beta_features = True csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" # check the first in the models list. It should be the best colNames = [ 'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN', 'NLV', 'LIV', 'WT', 'AGLP', 'MST' ] modelKey = 'GBMGrid_benign' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'STR', 'learn_rate': '.1,.2', 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': 'FNDX', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs) if not DO_POLL: # no pattern waits for all print "\nfirst GBMResult:", h2o.dump_json(GBMResult) h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey) if 1 == 0: # FIX! get model? gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json( gbmTrainView['gbm_model']['errs'])
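# The grid tests encode parameter sweeps as strings: comma lists like '.1,.2'
# and what appears to be min:max:step ranges like '1:3:1' (the real expansion
# happens server-side in H2O). A client-side expander sketch under that
# assumption (expand_grid_param is a hypothetical name):
def expand_grid_param(spec):
    s = str(spec)
    if ':' in s:
        lo, hi, step = [int(v) for v in s.split(':')]
        return range(lo, hi + 1, step)  # inclusive, assuming min:max:step
    return [float(v) for v in s.split(',')]

# expand_grid_param('1:3:1') -> [1, 2, 3]; expand_grid_param('.1,.2') -> [0.1, 0.2]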