def test_rf_covtype_train_full_fvec(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = kwargs['ntrees'] * 60 start = time.time() print "Note train.csv is used for both train and validation" rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) job_key = rfv['job_key'] model_key = rfv['destination_key'] rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1, print_params=True) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv) self.assertLess(classification_error, 3, "train.csv should have full classification error: %s < 3" % classification_error) print "Trial #", trial, "completed"
def test_rf_covtype_train_full_fvec(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = kwargs['ntrees'] * 60 start = time.time() print "Note train.csv is used for both train and validation" rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) job_key = rfv['job_key'] model_key = rfv['destination_key'] rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv) # hmm..just using defaults above in RF? self.assertLess(classification_error, 4.8, "train.csv should have full classification error: %s < 4.8" % classification_error) print "Trial #", trial, "completed"
def completionHack(jobKey, modelKey):
    # When the job was dispatched without polling (DO_POLL off), block here
    # until it completes; in polling mode the dispatch already waited.
    if not DO_POLL:
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
    # print "FIX! how do we get the GLM result"
    a = h2o.nodes[0].completion_redirect(
        jsonRequest="2/GLMModelView.json",
        params={'_modelKey': modelKey})
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs = 1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + "_" + ".hex" start = time.time() parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=importFolderPath + "/" + csvFilename, schema="put", hex_key=trainKey, timeoutSecs=timeoutSecs, ) elapsed = time.time() - start print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] # GBM (train)**************************************** modelKey = "GBM_model" params = { "classification": 1, # faster? "destination_key": modelKey, "learn_rate": 0.1, "ntrees": 3, "max_depth": 8, "min_rows": 1, "response": 0, # this dataset has the response in the last col (0-9 to check) # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 # noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView["gbm_model"]["cms"][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView["gbm_model"]["errs"])
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs=1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + "_" + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** modelKey = "GBM_model" params = { 'classification': 1, # faster? 'destination_key': modelKey, 'learn_rate': .1, 'ntrees': 3, 'max_depth': 8, 'min_rows': 1, 'response': 0, # this dataset has the response in the last col (0-9 to check) # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 #noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cms = gbmTrainView['gbm_model']['cms'] cm = cms[-1]['_arr'] # use the last one print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr'] print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_GBMGrid_basic_prostate(self): h2o.beta_features = True csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') colNames = ['ID','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] modelKey = 'GBMGrid_prostate' # 'cols', 'ignored_cols_by_name', and 'ignored_cols' have to be exclusive params = { 'destination_key': modelKey, 'ignored_cols_by_name': 'ID', 'learn_rate': .1, 'ntrees': '4,100', 'max_depth': 8, 'min_rows': 1, 'response': 'CAPSULE', 'classification': 1 if DO_CLASSIFICATION else 0, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=not DO_POLL, **kwargs) if not DO_POLL: print "\nfirst GBMResult:", h2o.dump_json(GBMResult) statMean = h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." # FIX! after gbm grid, have to get the model keys from the json? gbmGridView = h2o.nodes[0].gbm_grid_view(job_key=GBMResult['job_key'], destination_key=modelKey) print h2o.dump_json(gbmGridView) if 1==0: gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])
def test_c9b_GBM_airlines_hdfs(self): h2o.beta_features = True files = [ ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed') ] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** # passes 5, fails 15 # for depth in [5,15,25,40]: for depth in [5,5,5,5,5]: params = { 'destination_key': "GBMKEY", 'learn_rate': .2, 'nbins': 1024, 'ntrees': 10, 'max_depth': depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } print "Using these parameters for GBM: ", params kwargs = params.copy() start = time.time() print "Start time is: ", time.time() #noPoll -> False when GBM finished GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,timeoutSecs=timeoutSecs,**kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) h2j.pollWaitJobs(pattern="GBMKEY",timeoutSecs=1800,pollTimeoutSecs=1800) print "Finished time is: ", time.time() elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds. 
On dataset: ", csvFilename #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY') #print GBMView['gbm_model']['errs'] h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_c9_GBM_airlines_hdfs(self): h2o.beta_features = True files = [("datasets", "airlines_all.csv", "airlines_all.hex", 1800, "IsDepDelayed")] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema="hdfs", hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] # GBM (train)**************************************** for depth in [5, 15]: params = { "destination_key": "GBMKEY", "learn_rate": 0.2, "nbins": 1024, "ntrees": 10, "max_depth": depth, "min_rows": 10, "response": response, "ignored_cols_by_name": "CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed", } print "Using these parameters for GBM: ", params kwargs = params.copy() timeoutSecs = 1800 start = time.time() print "Start time is: ", time.time() # noPoll -> False when GBM finished GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = (statMean["num_cpus"],) my_cpu_pct = (statMean["my_cpu_%"],) sys_cpu_pct = (statMean["sys_cpu_%"],) system_load = statMean["system_load"] # shouldn't need this? h2j.pollWaitJobs( pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5 ) print "Finished time is: ", time.time() elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds. 
On dataset: ", csvFilename # GBMView = h2o_cmd.runGBMView(model_key='GBMKEY') # print GBMView['gbm_model']['errs'] h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_rf_covtype_train_full_fvec(self): h2o.beta_features = True csvFilename = "covtype.data" csvPathname = "standard/" + csvFilename parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=180 ) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = kwargs["ntrees"] * 60 start = time.time() print "Note train.csv is used for both train and validation" rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5) elapsed = time.time() - start print "RF end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100 ) job_key = rfv["job_key"] model_key = rfv["destination_key"] rfv = h2o_cmd.runRFView( data_key=parseResult["destination_key"], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1 ) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv) # hmm..just using defaults above in RF? self.assertLess( classification_error, 4.8, "train.csv should have full classification error: %s < 4.8" % classification_error, ) print "Trial #", trial, "completed"
def test_GBM_manyfiles_multijob(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 
1 : 0))' % (trainKey, trainKey, trainKey) # inc by 1 for R col # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate # only a problem if they share the dataset, do classification with integers. # change to factor here, to avoid the problem execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** csvPathname = importFolderPath + "/" + testFilename parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! # plus 1 for R indexing execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] # add 1 for start-with-1 ignored_cols_by_name = ",".join( map(lambda x: "C" + str(x + 1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % 'C' + str( response + 1) ntrees = 10 trial = 0 # ignore 200 random cols (not the response) print "Kicking off multiple GBM jobs at once" # GBM train**************************************** if DO_FAIL: cases = [5, 10, 20, 40] else: cases = [5, 10, 20] for max_depth in cases: trial += 1 params = { 'response': "C" + str(response + 1), 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'validation': parseTestResult['destination_key'], 'ignored_cols_by_name': ignored_cols_by_name, 'grid_parallelism': 1, 'classification': 1 if DO_CLASSIFICATION else 0, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() # can take 4 times as long with 4 jobs? gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs) trainElapsed = time.time() - trainStart print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
def test_PCA_ignore_enums_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 3, 'cA', 300), # (10001, 2, 'cA', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** start = time.time() modelKey = 'PCAModelKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'ignored_cols': 'C1', 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) 
h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey = modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_c7_rel(self): print "Running with h2o.beta_features=True for all" h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? 
#summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect if DO_INSPECT: x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x else: x = None kwargs = { # 'x': x, 'response': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 4, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_GBM_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilename = "mnist_training.csv.gz" timeoutSecs = 1800 trialStart = time.time() # PARSE train**************************************** trainKey = csvFilename + "_" + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilename, schema='put', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** modelKey = "GBM_model" params = { 'classification': 1, # faster? 'destination_key': modelKey, 'learn_rate': .1, 'ntrees': 3, 'max_depth': 8, 'min_rows': 1, 'response': 0, # this dataset has the response in the last col (0-9 to check) # 'ignored_cols_by_name': range(200,784) # only use the first 200 for speed? } kwargs = params.copy() timeoutSecs = 1800 #noPoll -> False when GBM finished start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cms = gbmTrainView['gbm_model']['cms'] cm = cms[-1]['_arr'] # use the last one print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr'] print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json( gbmTrainView['gbm_model']['errs'])
def test_c9_GBM_airlines_hdfs(self): h2o.beta_features = True files = [('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')] for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files: # PARSE train**************************************** csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GBM (train)**************************************** for depth in [5, 15]: params = { 'destination_key': "GBMKEY", 'learn_rate': .2, 'nbins': 1024, 'ntrees': 10, 'max_depth': depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed' } print "Using these parameters for GBM: ", params kwargs = params.copy() timeoutSecs = 1800 start = time.time() print "Start time is: ", time.time() #noPoll -> False when GBM finished GBMResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=timeoutSecs, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern="GBMKEY", timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5) print "Finished time is: ", time.time() elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds. 
On dataset: ", csvFilename #GBMView = h2o_cmd.runGBMView(model_key='GBMKEY') #print GBMView['gbm_model']['errs'] h2i.delete_keys_at_all_nodes(timeoutSecs=600)
def test_GLM2_mnist(self): if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % ( trainKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % ( testKey, y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] # This seems wrong..what's the format of the cm? cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y + 1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 9, "Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_RF_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local', hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) params = { 'response': 'C' + str(y+1), 'cols': None, 'ignored_cols_by_name': ignore_x, 'classification': 1, 'validation': None, 'ntrees': 2, 'max_depth': 20, 'min_rows': None, 'nbins': 1000, 'mtries': None, 'sample_rate': 0.66, 'seed': None, } rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow params['destination_key'] = 'RFModel_' + str('jobDispatch') kwargs = params.copy() timeoutSecs = 1200 start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs) elapsed = time.time() - start # print h2o.dump_json(rfResult) print "rf job dispatch end on ", trainCsvFilename, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch # FIX! are these already in there? rfView = {} rfView['data_key'] = trainKey2 rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] rfViewInitial.append(rfView) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) # FIX! need to add the rfview and predict stuff # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertAlmostEqual(classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) # print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cms = rf_model['cms'] errs = rf_model['errs'] # FIX! should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
def test_GBM_manyfiles_multijob(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! # execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 
1 : 0))' % (trainKey, trainKey, trainKey) # inc by 1 for R col # BUG: if left as integer..GBM changes to Enum. multiple jobs collide on this translate # only a problem if they share the dataset, do classification with integers. # change to factor here, to avoid the problem execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** csvPathname = importFolderPath + "/" + testFilename parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! # plus 1 for R indexing execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) if not DO_FAIL: execExpr += "; factor(%s[, 378+1]);" % (testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: "C" + str(x), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 trial = 0 # ignore 200 random cols (not the response) print "Kicking off multiple GBM jobs at once" # GBM train**************************************** if DO_FAIL: cases = [5, 10, 20, 40] else: cases = [5, 10, 20] for max_depth in cases: trial += 1 params = { 'response': "C" + str(response), 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'validation': parseTestResult['destination_key'], 'ignored_cols_by_name': ignored_cols_by_name, 'grid_parallelism': 1, 'classification': 1 if DO_CLASSIFICATION else 0, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() # can take 4 times as long with 4 jobs? gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs * 4, destination_key=modelKey + "_" + str(trial), **kwargs) trainElapsed = time.time() - trainStart print "GBM dispatch completed in", trainElapsed, "seconds. On dataset: ", trainFilename statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
def test_c7_fvec(self): print "Since the python is not necessarily run as user=0xcust.." print "r can't use schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # apparently h2o will create a "_" to replace the "-"..so lets force the destination key name csvFilename = "part-00000b" hex_key = "part_00000b.hex" importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', separator=9, hex_key=hex_key, doSummary=False, timeoutSecs=500) print "Parse of", parseResult['destination_key'], "took", time.time( ) - start, "seconds" print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, hex_key, timeoutSecs=500) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=hex_key, numCols=numCols, numRows=numRows, timeoutSecs=500) # there may be a lot NAs. 
# we don't want to ignore any cols, and we don't want to ignore row # so impute to median # zero indexed column for column in range(numCols): print "Imputing any NAs in column %s to median" % column impResult = h2o.nodes[0].impute(source=hex_key, column=column, method='median') # check that there are no missing now inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect) if len(missingValuesList) != 0: raise Exception("Shouldn't be missing values after impute: %s" % missingValuesList) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=hex_key) print "x:", x kwargs = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local', hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) params = { 'response': 'C' + str(y), 'cols': None, 'ignored_cols_by_name': ignore_x, 'classification': 1, 'validation': None, 'ntrees': 10, 'max_depth': 20, 'min_rows': None, 'nbins': 1000, 'mtries': None, 'sample_rate': 0.66, 'seed': None, } rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow params['destination_key'] = 'RFModel_' + str('jobDispatch') kwargs = params.copy() timeoutSecs = 1200 start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs) elapsed = time.time() - start # print h2o.dump_json(rfResult) print "rf job dispatch end on ", trainCsvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # FIX! are these already in there? rfView = {} rfView['data_key'] = trainKey2 rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] rfViewInitial.append(rfView) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) # FIX! need to add the rfview and predict stuff # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertAlmostEqual( classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) # print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cms = rf_model['cms'] ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] # FIX! should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
def test_GLM2_mnist_short(self): importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" # first col is pixel value ..use 0 here y = 0 ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, # first column is pixel value 'response': 'C' + str(y+1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) execExpr="A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=trainKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='A.hex', cols=0, max_ncols=1, noPrint=False) execExpr="B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=testKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='B.hex', cols=0, max_ncols=1, noPrint=False) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm)
def test_c7_fvec(self): print "Since the python is not necessarily run as user=0xcust.." print "r can't use schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' hex_key = csvFilename = ".hex" importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, hex_key, timeoutSecs=500) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? #summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=hex_key, numCols=numCols, numRows=numRows, timeoutSecs=500) # there may be a lot NAs. 
# we don't want to ignore any cols, and we don't want to ignore row # so impute to median # zero indexed column for column in range(numCols): print "Imputing any NAs in column %s to median" % column impResult = h2o.nodes[0].impute(source=hex_key, column=column, method='median') # check that there are no missing now inspect = h2o_cmd.runInspect(key=hex_key) missingValuesList = h2o_cmd.infoFromInspect(inspect) if len(missingValuesList)!=0: raise Exception ("Shouldn't be missing values after impute: %s" % missingValuesList) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=hex_key) print "x:", x kwargs = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_c10_rel_gbm(self):
    # Parse a private 0xcustomer classification train/test pair, train a
    # 100-tree GBM on a hand-picked column subset, print the training CM,
    # then score the test set (and optionally fetch a predict CM).
    # Requires a running H2O cloud and /mnt/0xcustomer-datasets locally.
    h2o.beta_features = True
    print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here"
    print "Want to be able to run python as jenkins"
    print "I guess for big 0xcust files, we don't need schema='put'"
    print "For files that we want to put (for testing put), we can get non-private files"

    # Parse Test***********************************************************
    importFolderPath = '/mnt/0xcustomer-datasets/c3'
    testFilename = 'classification1Test.txt'
    testPathname = importFolderPath + "/" + testFilename
    start = time.time()
    parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True)
    print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds"

    # Parse Train***********************************************************
    importFolderPath = '/mnt/0xcustomer-datasets/c3'
    trainFilename = 'classification1Train.txt'
    trainPathname = importFolderPath + "/" + trainFilename
    start = time.time()
    parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True)
    print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"

    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500)
    print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, trainPathname)
    # num_rows = inspect['num_rows']
    # num_cols = inspect['num_cols']

    # do summary of the parsed dataset last, since we know it fails on this dataset
    summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
    h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

    # keepList = []
    # h2o_glm.findXFromColumnInfo(key=parseTrainResult['destination_key'], keepList=keepList)
    # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices

    # GBM Train***********************************************************
    # hand-picked column indices for this dataset (see README.txt note above)
    x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
    # response = 0
    # doesn't work if index is used?
    response = 'outcome'
    # x = range(inspect['num_cols'])
    # del x[response]
    ntrees = 100
    # fails with 40
    params = {
        'learn_rate': .2,
        'nbins': 1024,
        'ntrees': ntrees,
        'max_depth': 20,
        'min_rows': 2,
        'response': response,
        'cols': x,
        # 'ignored_cols_by_name': None,
    }
    print "Using these parameters for GBM: ", params
    kwargs = params.copy()

    modelKey = 'GBMModelKey'
    timeoutSecs = 900
    trainStart = time.time()
    # noPoll=True: dispatch the job, then wait for it via pollStatsWhileBusy
    gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
        noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
    # hack
    h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
    trainElapsed = time.time() - trainStart
    print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

    gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
    # errrs from end of list? is that the last tree?
    errsLast = gbmTrainView['gbm_model']['errs'][-1]
    print "GBM 'errsLast'", errsLast

    cm = gbmTrainView['gbm_model']['cm']
    pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
    print "Last line of this cm might be NAs, not CM"
    print "\nTrain\n==========\n"
    print h2o_gbm.pp_cm(cm)

    # GBM test****************************************
    predictKey = 'Predict.hex'
    h2o_cmd.runInspect(key=parseTestResult['destination_key'])
    start = time.time()
    gbmTestResult = h2o_cmd.runPredict(
        data_key=parseTestResult['destination_key'],
        model_key=modelKey,
        destination_key=predictKey,
        timeoutSecs=timeoutSecs)
    elapsed = time.time() - start
    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

    if DO_PREDICT_CM:
        # NOTE(review): both vactual and vpredict are 'predict' here, so this
        # compares the prediction against itself rather than against the
        # 'outcome' column — looks intentional per the inline comment, but
        # confirm against predict_confusion_matrix's expected args.
        gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseTestResult['destination_key'],
            vactual='predict',
            predict=predictKey,
            vpredict='predict',  # choices are 7 (now) and 'predict'
            )
        # errrs from end of list? is that the last tree?
        # all we get is cm
        cm = gbmPredictCMResult['cm']
        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "Last line of this cm is really NAs, not CM"
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, 'response': 'C' + str(y), # 'case_mode': '=', # 'case_val': 0, 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, ## 'weight': 1.0, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } if DO_ALL_DIGITS: cases = [0,1,2,3,4,5,6,7,8,9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) if DO_BUG: execExpr="A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (trainKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) if DO_BUG: execExpr="B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (testKey, y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) else: execExpr="B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr="B.hex[,%s]=(B.hex[,%s]==%s)" % (y+1, y+1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_selfKey'] # This seems wrong..what's the format of the cm? if 1==0: cm = glm['glm_model']['submodels'][0]['validation']['_cms'][0]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key='B.hex', model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual='B.hex', vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM2_mnist_short(self): h2o.beta_features = True importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTestResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTestResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" # first col is pixel value ..use 0 here y = 0 ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTestResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseTrainResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo( y, key=parseTrainResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX modelKey = 'GLM_model' params = { 'ignored_cols': ignoreX, # first column is pixel value 'response': 'C' + str(y + 1), 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 15, ## 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, 'destination_key': modelKey, } cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c # kwargs['case_val'] = c # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise) execExpr = "A.hex=%s" % (trainKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=trainKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='A.hex', cols=0, max_ncols=1, noPrint=False) execExpr = "B.hex=%s" % (testKey) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1, c) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) h2o_cmd.runSummary(key=testKey, cols=0, max_ncols=1, noPrint=False) h2o_cmd.runSummary(key='B.hex', cols=0, max_ncols=1, noPrint=False) timeoutSecs = 1800 start = time.time() aHack = {'destination_key': 'A.hex'} glmFirstResult = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) print "\nglmFirstResult:", h2o.dump_json(glmFirstResult) job_key = glmFirstResult['job_key'] h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=5) # double check...how come the model is bogus? 
h2o_jobs.pollWaitJobs() glm = h2o.nodes[0].glm_view(_modelKey=modelKey) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) modelKey = glm['glm_model']['_key'] cm = glm['glm_model']['submodels'][0]['validation']['_cms'][ -1]['_arr'] print "cm:", cm pctWrong = h2o_gbm.pp_cm_summary(cm) # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)") print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm)
def test_c7_rel(self): print "Running with h2o.beta_features=True for all" h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" csvFilename = 'part-00000b' importFolderPath = '/mnt/0xcustomer-datasets/c2' csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False) print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds" print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] # do summary of the parsed dataset last, since we know it fails on this dataset # does the json fail with too many?? 
#summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2) # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], max_ncols=2500) # can't do more than 1000 summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=numCols, numRows=numRows) keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'response': y, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 10, # 'thresholds': 0.5, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } timeoutSecs = 3600 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, noPoll=True, **kwargs) statMean = h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) num_cpus = statMean['num_cpus'], my_cpu_pct = statMean['my_cpu_%'], sys_cpu_pct = statMean['sys_cpu_%'], system_load = statMean['system_load'] # shouldn't need this? h2j.pollWaitJobs(pattern=None, timeoutSecs=timeoutSecs, pollTimeoutSecs=30, retryDelaySecs=5) # can't figure out how I'm supposed to get the model # GLMModel = glm['GLMModel'] # modelKey = GLMModel['model_key'] # glmView = h2o.nodes[0].glm_view(modelKey=modelKey) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_c10_rel_gbm(self): h2o.beta_features = True print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here" print "Want to be able to run python as jenkins" print "I guess for big 0xcust files, we don't need schema='put'" print "For files that we want to put (for testing put), we can get non-private files" # Parse Test*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' testFilename = 'classification1Test.txt' testPathname = importFolderPath + "/" + testFilename start = time.time() parseTestResult = h2i.import_parse(path=testPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTestResult['destination_key'], "took", time.time() - start, "seconds" # Parse Train*********************************************************** importFolderPath = '/mnt/0xcustomer-datasets/c3' trainFilename = 'classification1Train.txt' trainPathname = importFolderPath + "/" + trainFilename start = time.time() parseTrainResult = h2i.import_parse(path=trainPathname, schema='local', timeoutSecs=500, doSummary=True) print "Parse of", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" start = time.time() inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'], timeoutSecs=500) print "Inspect:", parseTrainResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, trainPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] # do summary of the parsed dataset last, since we know it fails on this dataset summaryResult = h2o_cmd.runSummary(key=parseTrainResult['destination_key']) h2o_cmd.infoFromSummary(summaryResult, noPrint=False) # GBM Train*********************************************************** x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70] # response = 0 # doesn't work if index is used? 
response = 'outcome' # x = range(inspect['num_cols']) # del x[response] ntrees = 10 # fails with 40 params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': 20, 'min_rows': 2, 'response': response, 'cols': x, # 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() modelKey = 'GBMModelKey' timeoutSecs = 900 trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast # get the last cm cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? 
# all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)