def notest_GenParity1(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() parityPl = h2o.find_file('syn_scripts/parity.pl') # two row dataset gets this. Avoiding it for now # java.lang.ArrayIndexOutOfBoundsException: 1 # at hex.rf.Data.sample_fair(Data.java:149) # always match the run below! print "\nAssuming two row dataset is illegal. avoiding" for x in xrange(10, 20, 10): shCmdString = "perl " + parityPl + " 128 4 " + str( x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split()) # algorithm for creating the path and filename is hardwired in parity.pl. csvFilename = "parity_128_4_" + str(x) + "_quad.data" trees = "1,2,3,4,5,6" timeoutSecs = 20 # always match the gen above! # FIX! we fail if min is 3 for x in xrange(10, 20, 10): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put') h2o_cmd.runSpeeDRF(parseResult=parseResult, response=8, ntrees=trees, timeoutSecs=timeoutSecs) timeoutSecs += 2
def test_speedrf_params_rand2_fvec(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' hex_key = 'covtype.data.hex' for trial in range(10): # params is mutable. This is default. # response is required for SpeeERF params = { 'response': 'C55', 'ntrees': 1, 'mtries': 7, 'balance_classes': 0, # never run with unconstrained balance_classes size if random sets balance_classes..too slow 'max_after_balance_size': 2, 'importance': 0} colX = h2o_util.pickRandParams(paramDict, params) if 'cols' in params and params['cols']: # exclusion if 'ignored_cols_by_name' in params: params['ignored_cols_by_name'] = None else: if 'ignored_cols_by_name' in params and params['ignored_cols_by_name']: params['mtries'] = random.randint(1,53) else: params['mtries'] = random.randint(1,54) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 80 + ((kwargs['ntrees']*80) * max(1,kwargs['mtries']/60) ) start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key) h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time()-start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def notest_RF_poker100(self):
    """Disabled test: parse smalldata poker/poker100 and train SpeeDRF with 6 trees."""
    h2o.beta_features = True
    trees = 6
    timeoutSecs = 20
    csvPathname = "poker/poker100"
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put")
    # The tree count is passed as ``ntrees``, matching every other runSpeeDRF
    # call in this file (this call previously used ``num_trees``).
    h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
def notest_RF_iris2(self):
    """Disabled test: parse smalldata iris/iris2.csv and train SpeeDRF with ntrees "1,2,3"."""
    h2o.beta_features = True
    # comma-separated list of tree counts: "1,2,3"
    treeSpec = ",".join(str(count) for count in range(1, 4))
    maxSecs = 20
    parsed = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', schema='put')
    h2o_cmd.runSpeeDRF(parseResult=parsed, ntrees=treeSpec, timeoutSecs=maxSecs)
def test_GenParity1(self): SYNDATASETS_DIR = h2o.make_syn_dir() parityPl = h2o.find_file('syn_scripts/parity.pl') # two row dataset gets this. Avoiding it for now # java.lang.ArrayIndexOutOfBoundsException: 1 # at hex.rf.Data.sample_fair(Data.java:149) # always match the run below! print "\nAssuming two row dataset is illegal. avoiding" for x in xrange(10,100,10): shCmdString = "perl " + parityPl + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split()) # algorithm for creating the path and filename is hardwired in parity.pl. csvFilename = "parity_128_4_" + str(x) + "_quad.data" trees = 6 timeoutSecs = 20 # always match the gen above! # FIX! we fail if min is 3 for x in xrange(10,100,10): sys.stdout.write('.') sys.stdout.flush() csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put') h2o_cmd.runSpeeDRF(parseResult=parseResult, response=8, ntrees=trees, timeoutSecs=timeoutSecs) trees += 10 timeoutSecs += 2
def test_RF_poker100(self): MISSING_RESPONSE = True trees = ",".join(map(str, range(1, 4))) trees = "1,2" timeoutSecs = 20 csvPathname = 'poker/poker100' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') jobs = [] for i in range(1): if MISSING_RESPONSE: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs) else: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs) job_key = rfResult['job_key'] model_key = rfResult['destination_key'] jobs.append((job_key, model_key)) h2o_jobs.pollWaitJobs(timeoutSecs=300) for job_key, model_key in jobs: gridResult = h2o.nodes[0].speedrf_grid_view( job_key=job_key, destination_key=model_key) # h2o_rf.showRFGridResults(GBMResult, 15) print "speedrf grid result for %s:", model_key, h2o.dump_json( gridResult)
def test_RF_poker100(self): MISSING_RESPONSE = False DO_MODEL_INSPECT = False trees = ",".join(map(str,range(10,50,2))) timeoutSecs = 20 csvPathname = 'poker/poker100' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') jobs = [] for i in range(1): if MISSING_RESPONSE: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs) else: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs) job_key = rfResult['job_key'] model_key = rfResult['destination_key'] jobs.append( (job_key, model_key) ) h2o_jobs.pollWaitJobs(timeoutSecs=300) for job_key, model_key in jobs: gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key) print "speedrf grid result for %s:", h2o.dump_json(gridResult) print "speedrf grid result errors:", gridResult['prediction_errors'] for i,j in enumerate(gridResult['jobs']): if DO_MODEL_INSPECT: print "\nspeedrf result %s:" % i, h2o.dump_json(h2o_cmd.runInspect(key=j['destination_key'])) else: # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key']) model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key']) print "model:", h2o.dump_json(model)
def notest_RF_poker100(self):
    """Disabled test: parse smalldata poker/poker100 and train SpeeDRF with 6 trees."""
    treeCount = 6
    maxSecs = 20
    parsed = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
    h2o_cmd.runSpeeDRF(parseResult=parsed, ntrees=treeCount, timeoutSecs=maxSecs)
def notest_RF_iris2(self):
    """Disabled test: parse smalldata iris/iris2.csv and train SpeeDRF with ntrees "1,2,3"."""
    # comma-separated list of tree counts: "1,2,3"
    treeSpec = ",".join(str(count) for count in range(1, 4))
    maxSecs = 20
    parsed = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', schema='put')
    h2o_cmd.runSpeeDRF(parseResult=parsed, ntrees=treeSpec, timeoutSecs=maxSecs)
def notest_RF_iris2(self):
    """Disabled test: parse smalldata iris/iris2.csv and train SpeeDRF with 6 trees."""
    h2o.beta_features = True
    treeCount = 6
    maxSecs = 20
    parsed = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', schema='put')
    h2o_cmd.runSpeeDRF(parseResult=parsed, ntrees=treeCount, timeoutSecs=maxSecs)
def doBoth(): h2o.verboseprint("Trial", trial) start = time.time() # make sure ntrees and max_depth are the same for both rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed1 = time.time() - start (totalError1, classErrorPctList1, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed2 = time.time() - start (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) print "Checking that results are similar (within 20%)" print "DRF2 then SpeeDRF" print "per-class variance is large..basically we can't check very well for this dataset" for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)): print "classErrorPctList[%s]:i %s %s" % (i, j, k) # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], # delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i) print "totalError: %s %s" % (totalError1, totalError2) self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF") print "elapsed: %s %s" % (elapsed1, elapsed2) self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
def test_speedrf_params_rand2_fvec(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' hex_key = 'covtype.data.hex' for trial in range(10): # params is mutable. This is default. # response is required for SpeeERF params = { 'response': 'C55', 'ntrees': 1, 'mtries': 7, 'balance_classes': 0, # never run with unconstrained balance_classes size if random sets balance_classes..too slow 'max_after_balance_size': 2, 'importance': 0 } colX = h2o_util.pickRandParams(paramDict, params) if 'cols' in params and params['cols']: # exclusion if 'ignored_cols_by_name' in params: params['ignored_cols_by_name'] = None else: if 'ignored_cols_by_name' in params and params[ 'ignored_cols_by_name']: params['mtries'] = random.randint(1, 53) else: params['mtries'] = random.randint(1, 54) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 80 + ( (kwargs['ntrees'] * 80) * max(1, kwargs['mtries'] / 60)) start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key) h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs)
def test_rf_float_bigexp_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 1000 colCount = 7 write_syn_dataset(csvPathname, totalRows, colCount, headerData) for trial in range(5): # grow the data set rowData = rand_rowData(colCount) num = random.randint(4096, 10096) append_syn_dataset(csvPathname, colCount, num) totalRows += num # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'response': 'AGE', 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True) start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=15, **kwargs) print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' rfView["drf_model"] = rfView.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, ntree=ntree) inspect = h2o_cmd.runInspect(key=hex_key) cols = inspect['cols'] #num_cols = inspect['num_cols'] #for i,c in enumerate(cols): # if i < (num_cols-1): # everything except the last col (output) should be 8 byte float # colType = c['type'] # self.assertEqual(colType, 'float', msg="col %d should be type Real: %s" % (i, colType)) h2o.check_sandbox_for_errors()
def test_RF_poker100(self): trees = ",".join(map(str,range(1,4))) timeoutSecs = 20 csvPathname = 'poker/poker100' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs) job_key = rfResult['job_key'] model_key = rfResult['destination_key'] gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key) print "speedrf grid result:", h2o.dump_json(gridResult)
def test_RF_poker100(self): MISSING_RESPONSE = False DO_MODEL_INSPECT = False trees = ",".join(map(str, range(10, 50, 2))) timeoutSecs = 20 csvPathname = 'poker/poker100' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') jobs = [] for i in range(1): if MISSING_RESPONSE: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs) else: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs) job_key = rfResult['job_key'] model_key = rfResult['destination_key'] jobs.append((job_key, model_key)) h2o_jobs.pollWaitJobs(timeoutSecs=300) for job_key, model_key in jobs: gridResult = h2o.nodes[0].speedrf_grid_view( job_key=job_key, destination_key=model_key) print "speedrf grid result for %s:", h2o.dump_json(gridResult) print "speedrf grid result errors:", gridResult[ 'prediction_errors'] for i, j in enumerate(gridResult['jobs']): if DO_MODEL_INSPECT: print "\nspeedrf result %s:" % i, h2o.dump_json( h2o_cmd.runInspect(key=j['destination_key'])) else: # model = h2o.nodes[0].speedrf_view(modelKey=j['destination_key']) model = h2o.nodes[0].speedrf_view( modelKey=j['destination_key']) print "model:", h2o.dump_json(model)
def test_rf_float_bigexp_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 1000 colCount = 7 write_syn_dataset(csvPathname, totalRows, colCount, headerData) for trial in range (5): # grow the data set rowData = rand_rowData(colCount) num = random.randint(4096, 10096) append_syn_dataset(csvPathname, colCount, num) totalRows += num # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'response': 'AGE', 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True) start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=15, **kwargs) print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' rfView["drf_model"] = rfView.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, ntree=ntree) inspect = h2o_cmd.runInspect(key=hex_key) cols = inspect['cols'] #num_cols = inspect['num_cols'] #for i,c in enumerate(cols): # if i < (num_cols-1): # everything except the last col (output) should be 8 byte float # colType = c['type'] # self.assertEqual(colType, 'float', msg="col %d should be type Real: %s" % (i, colType)) h2o.check_sandbox_for_errors()
def test_RF_poker100(self): MISSING_RESPONSE = True trees = ",".join(map(str,range(1,4))) trees = "1,2" timeoutSecs = 20 csvPathname = 'poker/poker100' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') jobs = [] for i in range(1): if MISSING_RESPONSE: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs) else: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, response='C11', ntrees=trees, timeoutSecs=timeoutSecs) job_key = rfResult['job_key'] model_key = rfResult['destination_key'] jobs.append( (job_key, model_key) ) h2o_jobs.pollWaitJobs(timeoutSecs=300) for job_key, model_key in jobs: gridResult = h2o.nodes[0].speedrf_grid_view(job_key=job_key, destination_key=model_key) # h2o_rf.showRFGridResults(GBMResult, 15) print "speedrf grid result for %s:", model_key, h2o.dump_json(gridResult)
def doBoth(): h2o.verboseprint("Trial", trial) start = time.time() # make sure ntrees and max_depth are the same for both rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed1 = time.time() - start (totalError1, classErrorPctList1, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed2 = time.time() - start (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) print "Checking that results are similar (within 20%)" print "DRF2 then SpeeDRF" print "per-class variance is large..basically we can't check very well for this dataset" for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)): print "classErrorPctList[%s]:i %s %s" % (i, j, k) # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], # delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i) print "totalError: %s %s" % (totalError1, totalError2) self.assertAlmostEqual( totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF") print "elapsed: %s %s" % (elapsed1, elapsed2) self.assertAlmostEqual( elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
def test_rf_covtype_fvec(self): h2o.beta_features = True # fvec importFolderPath = "standard" # Parse Train ****************************************************** csvTrainFilename = 'covtype.shuffled.90pct.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) # Parse Test ****************************************************** csvTestFilename = 'covtype.shuffled.10pct.data' csvTestPathname = importFolderPath + "/" + csvTestFilename hex_key = csvTestFilename + ".hex" parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key']) rfViewInitial = [] xList = [] eList = [] fList = [] trial = 0 depthList = [10, 20, 30, 40] ntreesList = [5, 10, 20, 30] # ntreesList = [2] nbinsList = [10, 100, 1000] if TRY == 'max_depth': tryList = depthList elif TRY == 'ntrees': tryList = ntreesList elif TRY == 'nbins': tryList = nbinsList else: raise Exception("huh? %s" % TRY) for d in tryList: if TRY == 'max_depth': paramDict['max_depth'] = d elif TRY == 'ntrees': paramDict['ntrees'] = d elif TRY == 'nbins': paramDict['nbins'] = d else: raise Exception("huh? %s" % TRY) # adjust timeoutSecs with the number of trees # seems ec2 can be really slow if DO_OOBE: paramDict['validation'] = None else: paramDict['validation'] = parseTestResult['destination_key'] timeoutSecs = 30 + paramDict['ntrees'] * 200 # do ten starts, to see the bad id problem? 
TRIES = 5 for i in range(TRIES): lastOne = i == (TRIES - 1) # have unique model names trial += 1 kwargs = paramDict.copy() model_key = 'RFModel_' + str(trial) kwargs['destination_key'] = model_key data_key = parseTrainResult['destination_key'] start = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) trainElapsed = time.time() - start print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds' # don't cancel the last one if not lastOne: time.sleep(1) h2o_jobs.cancelAllJobs(timeoutSecs=2) ### print "rfView", h2o.dump_json(rfView) print "We have a result from the RF above, completed but didn't do RFView yet" # could the RF indicate 'done' too soon? # if rfResult['state']=='RUNNING': # raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult)) # if 'drf_model' not in rfResult: # raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult)) h2o_jobs.pollWaitJobs(timeoutSecs=300) rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) print "rfView:", h2o.dump_json(rfView) rfView["drf_model"] = rfView.pop("speedrf_model") rf_model = rfView['drf_model'] cms = rf_model['cms'] ### print "cm:", h2o.dump_json(cm) ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] varimp = rf_model['varimp'] treeStats = rf_model['treeStats'] print "maxDepth:", treeStats['maxDepth'] print "maxLeaves:", treeStats['maxLeaves'] print "minDepth:", treeStats['minDepth'] print "minLeaves:", treeStats['minLeaves'] print "meanLeaves:", treeStats['meanLeaves'] print "meanDepth:", treeStats['meanDepth'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) # we iterate over params, so can't really do this check # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too 
much" % classification_error) print "classErrorPctList:", classErrorPctList self.assertEqual( len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict" ) # FIX! should update this expected classification error predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key) eList.append(classErrorPctList[4]) fList.append(trainElapsed) if DO_PLOT: if TRY == 'max_depth': xLabel = 'max_depth' elif TRY == 'ntrees': xLabel = 'ntrees' elif TRY == 'nbins': xLabel = 'nbins' else: raise Exception("huh? %s" % TRY) xList.append(paramDict[xLabel]) if DO_PLOT: eLabel = 'class 4 pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_RF_mnist_both(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ntree = 25 params = { 'response': 0, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 2147483647, 'select_stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample_rate': 0.67, 'oobee': 1, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'destination_key': 'RF_model', 'nbins': 1024, # 'seed': 784834182943470027, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # RFView (score on test)**************************************** (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # was 2.84 # sometimes get 2.87? self.assertAlmostEqual( classification_error, 1.6, delta=1.6, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['speedrf_model']['treeStats'] leaves = { 'min': treeStats['minLeaves'], 'mean': treeStats['meanLeaves'], 'max': treeStats['maxLeaves'] } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': treeStats['minDepth'], 'mean': treeStats['meanDepth'], 'max': treeStats['maxDepth'] } depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() modelKey = rfView['speedrf_model']['_key'] predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_predict3_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() timeoutSecs = 600 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1==1: y = 4 # last col response = 'response' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'iris2.csv.hex' # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0} # No translate because we're using an Exec to get the data out?, and that loses the encoding? translate = None # one wrong will be 0.66667. I guess with random, that can happen? expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 else: y = 0 # first col response = 'C1' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist_training.hex' translate = { \ '0': 0, '1': 1, '2': 2, '3': 3, '4': 
4, \ '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 } expectedPctWrong = 0.7 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on 
source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o)!=str(p): if wrong==10: print "Not printing any more mismatches\n" elif wrong<10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 2.0: raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data") return pctWrong #***************************************************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) kwargs = { 'destination_key': 'rf_model', 'response': response, 'ntrees': trees, 'classification': 1, } rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) rfResult["drf_model"] = rfResult.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" print "y=", y pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) # can be zero if memorized (iris is either 0 or 0.667?) # just make delta 0.7 for now self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 0.7, msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def test_rf_big1_nopoll_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=15, schema='put') rfViewInitial = [] # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} model_key = "" if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'SRF_model' else: model_key = 'SRF_model' + str(jobDispatch) kwargs['ntrees'] = 1 if OVERWRITE_RF_MODEL: print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" kwargs['ntrees'] += 1 kwargs['seed'] = random.randint(0, sys.maxint) kwargs['response'] = "C107" # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=False, **kwargs) print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch print "\n MODEL KEY: ", model_key rfViewInitial.append(h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)) # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected first = None print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) model_key = rfView["speedrf_model"]['_key'] ntree = rfView["speedrf_model"]["parameters"]['ntrees'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) if first is None: # we'll use this to compare the others first = rfViewResult.copy() firstModelKey = model_key print "first", h2o.dump_json(first) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def test_rf_big1_nopoll_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=15, schema='put') rfViewInitial = [] # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} model_key = "" if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'SRF_model' else: model_key = 'SRF_model' + str(jobDispatch) kwargs['ntrees'] = 1 if OVERWRITE_RF_MODEL: print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" kwargs['ntrees'] += 1 kwargs['seed'] = random.randint(0, sys.maxint) kwargs['response'] = "C107" # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)] h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=False, **kwargs) print "rf job dispatch end on ", csvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch print "\n MODEL KEY: ", model_key rfViewInitial.append( h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)) # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected first = None print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) model_key = rfView["speedrf_model"]['_key'] ntree = rfView["speedrf_model"]["parameters"]['ntrees'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) if first is None: # we'll use this to compare the others first = rfViewResult.copy() firstModelKey = model_key print "first", h2o.dump_json(first) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def test_speedrf_mnist(self): importFolderPath = "mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("train.csv.gz", "test.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 784 # last column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) ntrees = 10 params = { 'response': y, 'ignored_cols_by_name': ignore_x, 'ntrees': ntrees, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 15, 'sample_rate': 0.67, 'destination_key': 'SpeeDRF_model', 'nbins': 1024, 'seed': 784834182943470027, 'oobee': 1, } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) rfv["drf_model"] = rfv.pop("speedrf_model") h2o_rf.simpleCheckRFView(None, rfv, **params) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] print "Total trees: ", used_trees print "On data key: ", data_key print "Produced model key: ", model_key
def test_rf_change_data_key_fvec(self): importFolderPath = 'standard' csvFilenameTrain = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseResult = parseResult # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
params = {'ntrees': 2, 'destination_key': 'RF_model'} # colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() kwargs["response"] = "C55" # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 100 start = time.time() h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntrees = kwargs['ntrees'] start = time.time() h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' for trial in range(3): # scoring start = time.time() rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' rfView["drf_model"] = rfView.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) # FIX! should update this expected classification error # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' print "Trial #", trial, "completed"
def test_rf_change_data_key_fvec(self): importFolderPath = 'standard' csvFilenameTrain = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseResult = parseResult # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
params = { 'ntrees': 2, 'destination_key': 'RF_model' } # colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() kwargs["response"] = "C55" # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 100 start = time.time() h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntrees = kwargs['ntrees'] start = time.time() h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(3): # scoring start = time.time() rfView = h2o_cmd.runSpeeDRFView(None, model_key,timeoutSecs) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' rfView["drf_model"] = rfView.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) # FIX! should update this expected classification error # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' print "Trial #", trial, "completed"
def test_RF_many_cols_enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u' ] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'RFModelKey' # Parse (train)**************************************** start = time.time() parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # RF(train iterate)**************************************** ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'nbins': 1024, 'classification': 1, 'ntrees': ntrees, 'max_depth': max_depth, 'response': 'C' + str(numCols - 1), 'ignored_cols_by_name': None, } print "Using these parameters for RF: ", params kwargs = params.copy() trainStart = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "RF training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str( max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) rfResult["drf_model"] = rfResult.pop("speedrf_model") errsLast = rfResult['drf_model']['errs'][-1] print "RF 'errsLast'", errsLast cm = rfResult['drf_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1 == 1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_covtype_fvec(self): h2o.beta_features = True # fvec importFolderPath = "standard" # Parse Train ****************************************************** csvTrainFilename = 'covtype.shuffled.90pct.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) # Parse Test ****************************************************** csvTestFilename = 'covtype.shuffled.10pct.data' csvTestPathname = importFolderPath + "/" + csvTestFilename hex_key = csvTestFilename + ".hex" parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key']) rfViewInitial = [] xList = [] eList = [] fList = [] trial = 0 depthList = [10, 20, 30, 40] ntreesList = [5, 10, 20, 30] # ntreesList = [2] nbinsList = [10, 100, 1000] if TRY == 'max_depth': tryList = depthList elif TRY == 'ntrees': tryList = ntreesList elif TRY == 'nbins': tryList = nbinsList else: raise Exception("huh? %s" % TRY) for d in tryList: if TRY == 'max_depth': paramDict['max_depth'] = d elif TRY == 'ntrees': paramDict['ntrees'] = d elif TRY == 'nbins': paramDict['nbins'] = d else: raise Exception("huh? %s" % TRY) # adjust timeoutSecs with the number of trees # seems ec2 can be really slow if DO_OOBE: paramDict['validation'] = None else: paramDict['validation'] = parseTestResult['destination_key'] timeoutSecs = 30 + paramDict['ntrees'] * 200 # do ten starts, to see the bad id problem? 
TRIES = 5 for i in range(TRIES): lastOne = i==(TRIES-1) # have unique model names trial += 1 kwargs = paramDict.copy() model_key = 'RFModel_' + str(trial) kwargs['destination_key'] = model_key data_key = parseTrainResult['destination_key'] start = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) trainElapsed = time.time() - start print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds' # don't cancel the last one if not lastOne: time.sleep(1) h2o_jobs.cancelAllJobs(timeoutSecs=2) ### print "rfView", h2o.dump_json(rfView) print "We have a result from the RF above, completed but didn't do RFView yet" # could the RF indicate 'done' too soon? # if rfResult['state']=='RUNNING': # raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult)) # if 'drf_model' not in rfResult: # raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult)) h2o_jobs.pollWaitJobs(timeoutSecs=300) rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) print "rfView:", h2o.dump_json(rfView) rfView["drf_model"] = rfView.pop("speedrf_model") rf_model = rfView['drf_model'] cms = rf_model['cms'] ### print "cm:", h2o.dump_json(cm) ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] varimp = rf_model['varimp'] treeStats = rf_model['treeStats'] print "maxDepth:", treeStats['maxDepth'] print "maxLeaves:", treeStats['maxLeaves'] print "minDepth:", treeStats['minDepth'] print "minLeaves:", treeStats['minLeaves'] print "meanLeaves:", treeStats['meanLeaves'] print "meanDepth:", treeStats['meanDepth'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) # we iterate over params, so can't really do this check # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % 
classification_error) print "classErrorPctList:", classErrorPctList self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict") # FIX! should update this expected classification error predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key) eList.append(classErrorPctList[4]) fList.append(trainElapsed) if DO_PLOT: if TRY == 'max_depth': xLabel = 'max_depth' elif TRY == 'ntrees': xLabel = 'ntrees' elif TRY == 'nbins': xLabel = 'nbins' else: raise Exception("huh? %s" % TRY) xList.append(paramDict[xLabel]) if DO_PLOT: eLabel = 'class 4 pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_speedrf_mnist(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600), ("train.csv.gz", "test.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 784 # last column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntrees = 10 params = { 'response': y, 'ignored_cols_by_name': ignore_x, 'ntrees': ntrees, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 15, 'sample_rate': 0.67, 'destination_key': 'SpeeDRF_model', 'nbins': 1024, 'seed': 784834182943470027, 'oobee': 1, } kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) rfv["drf_model"] = rfv.pop("speedrf_model") h2o_rf.simpleCheckRFView(None, rfv, **params) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] print "Total trees: ", used_trees print "On data key: ", data_key print "Produced model key: ", model_key
def test_rf_enums_mappings(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (n, 1, 'cD', 300), # (n, 2, 'cE', 300), # (n, 3, 'cF', 300), # (n, 4, 'cG', 300), # (n, 5, 'cH', 300), # (n, 6, 'cI', 300), (ROWS, COLS, 'cI', 300), (ROWS, COLS, 'cI', 300), (ROWS, COLS, 'cI', 300), ] # SEED_FOR_TRAIN = random.randint(0, sys.maxint) SEED_FOR_TRAIN = 1234567890 SEED_FOR_SCORE = 9876543210 errorHistory = [] enumHistory = [] lastcolsTrainHistory = [] lastcolsScoreHistory = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: enumList = create_enum_list(listSize=ENUMS) # reverse the list enumList.reverse() # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename # use same enum List enumListForScore = enumList print "Creating random", csvPathname, "for rf model building" lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN) lastcolsTrainHistory.append(lastcols) print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)" # same enum list/mapping, but different dataset? 
lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE) lastcolsScoreHistory.append(lastcols) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' # limit depth and number of trees to accentuate the issue with categorical split decisions # use mtries so both look at all cols at every split? doesn't matter for speedrf # does speedrf try one more time? with 3 cols, mtries=2, so another try might # get a look at the missing col # does matter for drf2. 
does it "just stop" # trying mtries always looking at all columns or 1 col might be interesting if SPEEDRF: kwargs = { 'sample_rate': 0.999, 'destination_key': modelKey, 'response': y, 'ntrees': 1, 'max_depth': 100, # 'oobee': 1, 'validation': hex_key, # 'validation': scoreDataKey, 'seed': 123456789, 'mtries': COLS, } elif GBM: kwargs = { 'destination_key': modelKey, 'response': y, 'validation': scoreDataKey, 'seed': 123456789, # 'learn_rate': .1, 'ntrees': 1, 'max_depth': 100, 'min_rows': 1, 'classification': 1, } else: kwargs = { 'sample_rate': 0.999, 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': 1, 'max_depth': 100, 'min_rows': 1, 'validation': hex_key, # 'validation': scoreDataKey, 'seed': 123456789, 'nbins': 1024, 'mtries': COLS, } for r in range(2): start = time.time() if GBM: gbmResult = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "gbm end on ", parseResult[ 'destination_key'], 'took', time.time( ) - start, 'seconds' # print h2o.dump_json(gbmResult) (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult) elif SPEEDRF: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "speedrf end on ", parseResult[ 'destination_key'], 'took', time.time( ) - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) else: rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult[ 'destination_key'], 'took', time.time( ) - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5) errorHistory.append(classification_error) enumHistory.append(enumList) print "error from all 
runs on this dataset (with different enum mappings)" print errorHistory for e in enumHistory: print e print "last row from all train datasets, as integer" for l in lastcolsTrainHistory: print l print "last row from all score datasets, as integer" for l in lastcolsScoreHistory: print l
def test_rf_enums_mappings(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (n, 1, 'cD', 300), # (n, 2, 'cE', 300), # (n, 3, 'cF', 300), # (n, 4, 'cG', 300), # (n, 5, 'cH', 300), # (n, 6, 'cI', 300), (ROWS, COLS, "cI", 300), (ROWS, COLS, "cI", 300), (ROWS, COLS, "cI", 300), ] # SEED_FOR_TRAIN = random.randint(0, sys.maxint) SEED_FOR_TRAIN = 1234567890 SEED_FOR_SCORE = 9876543210 errorHistory = [] enumHistory = [] lastcolsTrainHistory = [] lastcolsScoreHistory = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: enumList = create_enum_list(listSize=ENUMS) # reverse the list enumList.reverse() # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename # use same enum List enumListForScore = enumList print "Creating random", csvPathname, "for rf model building" lastcols = write_syn_dataset( csvPathname, enumList, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN, ) lastcolsTrainHistory.append(lastcols) print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)" # same enum list/mapping, but different dataset? 
lastcols = write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE, ) lastcolsScoreHistory.append(lastcols) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse( path=csvScorePathname, schema="put", hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) y = colCount modelKey = "enums" # limit depth and number of trees to accentuate the issue with categorical split decisions # use mtries so both look at all cols at every split? doesn't matter for speedrf # does speedrf try one more time? with 3 cols, mtries=2, so another try might # get a look at the missing col # does matter for drf2. 
does it "just stop" # trying mtries always looking at all columns or 1 col might be interesting if SPEEDRF: kwargs = { "sample_rate": 0.999, "destination_key": modelKey, "response": y, "ntrees": 1, "max_depth": 100, # 'oobee': 1, "validation": hex_key, # 'validation': scoreDataKey, "seed": 123456789, "mtries": COLS, } elif GBM: kwargs = { "destination_key": modelKey, "response": y, "validation": scoreDataKey, "seed": 123456789, # 'learn_rate': .1, "ntrees": 1, "max_depth": 100, "min_rows": 1, "classification": 1, } else: kwargs = { "sample_rate": 0.999, "destination_key": modelKey, "response": y, "classification": 1, "ntrees": 1, "max_depth": 100, "min_rows": 1, "validation": hex_key, # 'validation': scoreDataKey, "seed": 123456789, "nbins": 1024, "mtries": COLS, } for r in range(2): start = time.time() if GBM: gbmResult = h2o_cmd.runGBM( parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs ) print "gbm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" # print h2o.dump_json(gbmResult) (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult) elif SPEEDRF: rfResult = h2o_cmd.runSpeeDRF( parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs ) print "speedrf end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) else: rfResult = h2o_cmd.runRF( parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs ) print "rf end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) h2o_cmd.runScore( dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL ) # , expectedAuc=0.5) errorHistory.append(classification_error) enumHistory.append(enumList) print "error from all 
runs on this dataset (with different enum mappings)" print errorHistory for e in enumHistory: print e print "last row from all train datasets, as integer" for l in lastcolsTrainHistory: print l print "last row from all score datasets, as integer" for l in lastcolsScoreHistory: print l
def test_RF_many_cols_enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'RFModelKey' # Parse (train)**************************************** start = time.time() parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # RF(train iterate)**************************************** ntrees = 10 for max_depth in [5,10,20,40]: params = { 'nbins': 1024, 'classification': 1, 'ntrees': ntrees, 'max_depth': max_depth, 'response': 'C' + str(numCols-1), 'ignored_cols_by_name': None, } print "Using these parameters for RF: ", params kwargs = params.copy() trainStart = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "RF training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) rfResult["drf_model"] = rfResult.pop("speedrf_model") errsLast = rfResult['drf_model']['errs'][-1] print "RF 'errsLast'", errsLast cm = rfResult['drf_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1==1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_big1_overwrite_model_fvec(self): csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=15, schema='put') firstRfView = None # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" if OVERWRITE_RF_MODEL: kwargs['ntrees'] = 1 + jobDispatch else: kwargs['ntrees'] = 1 # don't change the seed if we're overwriting the model. It should get # different results just from changing the tree count kwargs['seed'] = random.randint(0, sys.maxint) kwargs["response"] = "C107" # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)] h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=True, **kwargs) # FIX! are these already in there? 
rfView = {} rfView['_dataKey'] = hex_key rfView['_key'] = model_key print "rf job dispatch end on ", csvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # we're going to compare rf results to previous as we go along (so we save rf view results h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # In this test we're waiting after each one, so we can save the RFView results for comparison to future print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['_dataKey'] model_key = rfView['_key'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) if firstRfView is None: # we'll use this to compare the others firstRfView = rfViewResult.copy() firstModelKey = model_key print "firstRfView", h2o.dump_json(firstRfView) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference) self.assertGreater(len(df.difference), 29, msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \ (len(df.difference), h2o.dump_json(df.difference)))
def test_RF_mnist_both(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ntree = 25 params = { 'response': 0, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 2147483647, 'select_stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample_rate': 0.67, 'oobee': 1, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'destination_key': 'RF_model', 'nbins': 1024, # 'seed': 784834182943470027, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # RFView (score on test)**************************************** (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" self.assertAlmostEqual(classification_error, 2.84, delta=0.5, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['speedrf_model']['treeStats'] leaves = {'min': treeStats['minLeaves'], 'mean': treeStats['meanLeaves'], 'max': treeStats['maxLeaves']} # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = {'min': treeStats['minDepth'], 'mean': treeStats['meanDepth'], 'max': treeStats['maxDepth']} depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() modelKey = rfView['speedrf_model']['_key'] predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_predict3_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() timeoutSecs = 600 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1 == 1: y = 4 # last col response = 'response' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'iris2.csv.hex' # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0} # No translate because we're using an Exec to get the data out?, and that loses the encoding? translate = None # one wrong will be 0.66667. I guess with random, that can happen? expectedPctWrong = 0.7 elif 1 == 0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 elif 1 == 0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 elif 1 == 0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 else: y = 0 # first col response = 'C1' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist_training.hex' translate = { \ '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \ '5': 
5, '6': 6, '7': 7, '8': 8, '9': 9 } expectedPctWrong = 0.7 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey + "=" + hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col( csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col( csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no 
header on source if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)): raise Exception( "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o) != str(p): if wrong == 10: print "Not printing any more mismatches\n" elif wrong < 10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong) / len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 2.0: raise Exception( "pctWrong too high. Expect < 2% error because it's reusing training data" ) return pctWrong #***************************************************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) kwargs = { 'destination_key': 'rf_model', 'response': response, 'ntrees': trees, 'classification': 1, } rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) rfResult["drf_model"] = rfResult.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" print "y=", y pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) # can be zero if memorized (iris is either 0 or 0.667?) # just make delta 0.7 for now self.assertAlmostEqual( pctWrong, expectedPctWrong, delta=0.7, msg= "predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def test_rf_covtype20x_fvec(self):
    """Train SpeeDRF on covtype, then time model views and predictions.

    The same covtype file (1x or 20x, chosen by the module-level DO_SMALL
    flag) is parsed twice under different keys ("A" for training, "B" for
    test) and the test frame is copied to a third key ("C") with Exec.
    A 20-tree model is trained on "A", viewed twice, predictions are
    generated on "C", and a confusion matrix is computed for predictions
    made on the training frame.
    """
    importFolderPath = 'standard'

    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data  to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    kwargs = {'str': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    # NOTE(review): paramDict is only referenced by the commented-out
    # random-parameter picker below.
    paramDict = drf2ParamDict
    params = {
        'ntrees': 20,
        'destination_key': 'RF_model',
        'response': "C55",
    }

    # colX = h2o_rf.pickRandRfParams(paramDict, params)

    # kwargs is rebound here; the Exec arguments above are no longer needed
    kwargs = params.copy()
    # allow 60 seconds per tree on top of a 30 second base
    timeoutSecs = 30 + kwargs['ntrees'] * 60

    start = time.time()
    h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntree = kwargs['ntrees']

    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        # move the model under the 'drf_model' key, which is read below
        rfView["drf_model"] = rfView.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # 50 +/- 50 accepts any error from 0 to 100
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        # predict again on the training frame and build a confusion
        # matrix against its C55 column
        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()

        predictResult = h2o_cmd.runPredict(
            data_key=parseKey,
            model_key=rfModelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C55',
            predict=predictKey,
            vpredict='predict',
            )

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)

        print "Trial #", trial, "completed"
def test_rf_enums_mappings_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 3000 tryList = [ # (n, 1, 'cD', 300), # (n, 2, 'cE', 300), # (n, 3, 'cF', 300), # (n, 4, 'cG', 300), # (n, 5, 'cH', 300), # (n, 6, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), ] # SEED_FOR_TRAIN = random.randint(0, sys.maxint) SEED_FOR_TRAIN = 1234567890 SEED_FOR_SCORE = 9876543210 errorHistory = [] enumHistory = [] lastcolsTrainHistory = [] lastcolsScoreHistory = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: enumList = create_enum_list(listSize=ENUMS) # reverse the list enumList.reverse() # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename # use same enum List enumListForScore = enumList print "Creating random", csvPathname, "for rf model building" lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN) lastcolsTrainHistory.append(lastcols) print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)" # same enum list/mapping, but different dataset? 
lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE) lastcolsScoreHistory.append(lastcols) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' # limit depth and number of trees to accentuate the issue with categorical split decisions if SPEEDRF: kwargs = { 'destination_key': modelKey, 'response': y, 'num_trees': 1, 'max_depth': 100, 'oobee': 1, 'seed': 123456789, } else: kwargs = { 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': 1, 'max_depth': 100, 'min_rows': 1, 'validation': scoreDataKey, 'seed': 123456789, } for r in range(4): start = time.time() if SPEEDRF: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) else: rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' # print h2o.dump_json(rfResult) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5) errorHistory.append(classification_error) enumHistory.append(enumList) print "error from all runs on this dataset (with different enum mappings)" print errorHistory for e in enumHistory: print e print 
"last row from all train datasets, as integer" for l in lastcolsTrainHistory: print l print "last row from all score datasets, as integer" for l in lastcolsScoreHistory: print l
def test_rf_big1_overwrite_model_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=15, schema='put') firstRfView = None # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" if OVERWRITE_RF_MODEL: kwargs['num_trees'] = 1 + jobDispatch else: kwargs['num_trees'] = 1 # don't change the seed if we're overwriting the model. It should get # different results just from changing the tree count kwargs['seed'] = random.randint(0, sys.maxint) kwargs["response"] = "C107" # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=True, **kwargs) # FIX! are these already in there? 
rfView = {} rfView['_dataKey'] = hex_key rfView['_key'] = model_key print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch # we're going to compare rf results to previous as we go along (so we save rf view results h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # In this test we're waiting after each one, so we can save the RFView results for comparison to future print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['_dataKey'] model_key = rfView['_key'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) if firstRfView is None: # we'll use this to compare the others firstRfView = rfViewResult.copy() firstModelKey = model_key print "firstRfView", h2o.dump_json(firstRfView) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference) self.assertGreater(len(df.difference), 29, msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \ (len(df.difference), h2o.dump_json(df.difference)))