def test_rf_big1_nopoll(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) rfViewInitial = [] # dispatch multiple jobs back to back for jobDispatch in range(1): start = time.time() kwargs = {} # FIX! what model keys do these get? rfView = h2o_cmd.runRFOnly(parseKey=parseKey, model_key="RF_model"+str(jobDispatch),\ timeoutSecs=300, noPoll=True, **kwargs) rfViewInitial.append(rfView) print "rf job dispatch end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=30, pollTimeoutSecs=120, retryDelaySecs=5) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for rfView in rfViewInitial: print "Checking completed job, with no polling:", rfView a = h2o.nodes[0].poll_url(rf['response'], noPoll=True) h2o_rf.simpleCheckRFView(None, a)
def doBoth(): h2o.verboseprint("Trial", trial) start = time.time() # make sure ntrees and max_depth are the same for both rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed1 = time.time() - start (totalError1, classErrorPctList1, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed2 = time.time() - start (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) print "Checking that results are similar (within 20%)" print "DRF2 then SpeeDRF" print "per-class variance is large..basically we can't check very well for this dataset" for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)): print "classErrorPctList[%s]:i %s %s" % (i, j, k) # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], # delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i) print "totalError: %s %s" % (totalError1, totalError2) self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF") print "elapsed: %s %s" % (elapsed1, elapsed2) self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
def test_rf_covtype_train_oobe_fvec(self): h2o.beta_features = True print "\nRun test iterations/compare with covtype.data" rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False) (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1) # since we created a binomial output class..look at the error rate for class 1 ce1pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.shuffled.data" rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True) (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2) ce2pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.sorted.data" rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False) (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3) ce3pct1 = classErrorPctList[1] print "rfv3, from covtype.sorted.data" print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv" df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True) print "df.difference:", h2o.dump_json(df.difference) self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2)) self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3)) self.assertAlmostEqual(ce1pct1, ce2pct1, delta=0.5, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1)) self.assertAlmostEqual(ce1pct1, ce3pct1, delta=0.5, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
def test_RF(self): h2o.beta_features = True paramsTrainRF = { 'seed': '1234567890', # if I use 100, and just one tree, I should get same results for sorted/shuffled? # i.e. the bagging always sees everything. Means oobe will be messed up # so will specify validation = the 10pct holdout data (could reuse the training data?) 'sample_rate': 1.0, 'ntrees': 3, 'max_depth': 300, 'nbins': 200, 'timeoutSecs': 600, 'response': 'C55', } paramsScoreRF = { 'vactual': 'C55', 'timeoutSecs': 600, } # 90% data trainKey1 = self.loadData(trainDS1) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs) (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1) self.assertEqual(4.29, classification_error1) self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1) self.assertEqual(58101, totalScores1) kwargs = paramsScoreRF.copy() scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) # 10% data trainKey2 = self.loadData(trainDS2) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs) (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2) self.assertEqual(4.29, classification_error2) self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2) self.assertEqual(58101, totalScores2) kwargs = paramsScoreRF.copy() scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) # 
should only be two diffs if len(df.difference) > 2: raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def runRFView(node=None, data_key=None, model_key=None, ntree=None,
        timeoutSecs=15, retryDelaySecs=2, noise=None, noPoll=False, noPrint=False, **kwargs):
    """Poll the RFView of model_key until done (unless noPoll), sanity-checking progress.

    Returns the final RFView dict, or None when noPoll is set. Raises on
    inconsistent progress/tree counts in the poll responses.

    Fixes: the 'done but number_built!=ntree' Exception passed its values as
    extra args instead of %-formatting them; kwargs.setdefault() silently
    injected 'no_confusion_matrix' into the caller's kwargs where .get() suffices.
    """
    if not node:
        node = h2o.nodes[0]

    def test(n, tries=None):
        rfView = n.random_forest_view(data_key, model_key, timeoutSecs, noise=noise, **kwargs)
        status = rfView['response']['status']
        numberBuilt = rfView['trees']['number_built']
        if status == 'done':
            if numberBuilt != ntree:
                # BUG FIX: message is now actually formatted with the values
                raise Exception("RFview done but number_built!=ntree: %s %s" % (numberBuilt, ntree))
            return True
        if status != 'poll':
            raise Exception('Unexpected status: ' + status)

        progress = rfView['response']['progress']
        progressTotal = rfView['response']['progress_total']
        # want to double check all this because it's new
        # and we had problems with races/doneness before
        errorInResponse = \
            numberBuilt < 0 or ntree < 0 or numberBuilt > ntree or \
            progress < 0 or progressTotal < 0 or progress > progressTotal or \
            ntree != rfView['ntree']
            ## progressTotal!=ntree or
        # rfView better always agree with what RF ntree was
        if errorInResponse:
            raise Exception("\nBad values in response during RFView polling.\n" +
                "progress: %s, progressTotal: %s, ntree: %s, numberBuilt: %s, status: %s" % \
                (progress, progressTotal, ntree, numberBuilt, status))

        # don't print the useless first poll.
        # UPDATE: don't look for done. look for not poll was missing completion when looking for done
        if (status == 'poll'):
            if numberBuilt == 0:
                h2o.verboseprint(".")
            else:
                h2o.verboseprint("\nRFView polling #", tries,
                    "Status: %s. %s trees done of %s desired" % (status, numberBuilt, ntree))

        return (status != 'poll')

    if noPoll:
        return None

    node.stabilize(test, 'random forest reporting %d trees' % ntree,
        timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs)

    # kind of wasteful re-read, but maybe good for testing
    rfView = node.random_forest_view(data_key, model_key, timeoutSecs, noise=noise, **kwargs)
    # BUG FIX: .get() instead of .setdefault() — no mutation of the caller's kwargs
    if not kwargs.get('no_confusion_matrix', False):
        h2f.simpleCheckRFView(node, rfView, noPrint=noPrint)
    return rfView
def runRFScore(node=None, data_key=None, model_key=None, ntree=None,
        timeoutSecs=15, retryDelaySecs=2, doSimpleCheck=True, noPrint=False, **kwargs):
    """Score model_key against data_key; optionally run simpleCheckRFView on the result.

    Fixes: 'noPrint' was referenced but never defined, so every call with
    doSimpleCheck=True raised NameError. It is now a real keyword parameter
    (default False), which is backward compatible for all existing callers.
    """
    if not node:
        node = h2o.nodes[0]
    # kind of wasteful re-read, but maybe good for testing
    rfView = node.random_forest_score(data_key, model_key, timeoutSecs, **kwargs)
    if doSimpleCheck:
        h2f.simpleCheckRFView(node, rfView, noPrint=noPrint)
    return rfView
def test_rf_covtype_train_oobe_fvec(self): h2o.beta_features = True print "\nRun test iterations/compare with covtype.data" rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False) (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv1) # since we created a binomial output class..look at the error rate for class 1 ce1pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.shuffled.data" rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True) (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv2) ce2pct1 = classErrorPctList[1] print "\nRun test iterations/compare with covtype.sorted.data" rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False) (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv3) ce3pct1 = classErrorPctList[1] print "rfv3, from covtype.sorted.data" print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv" print "rfv1:", h2o.dump_json(rfv1) print "rfv3:", h2o.dump_json(rfv3) # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True) df = h2o_util.JsonDiff(rfv1, rfv3) print "df.difference:", h2o.dump_json(df.difference) self.assertAlmostEqual( ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2)) self.assertAlmostEqual( ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3)) self.assertAlmostEqual( ce1pct1, ce2pct1, delta=1.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1)) self.assertAlmostEqual( ce1pct1, ce3pct1, delta=1.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
def test_rf_covtype_train_full_fvec(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = kwargs['ntrees'] * 60 start = time.time() print "Note train.csv is used for both train and validation" rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) job_key = rfv['job_key'] model_key = rfv['destination_key'] rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1, print_params=True) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv) self.assertLess(classification_error, 3, "train.csv should have full classification error: %s < 3" % classification_error) print "Trial #", trial, "completed"
def test_rf_multinomial_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_multinomial.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 400 colCount = 7 for trial in range (5): write_syn_dataset(csvPathname, totalRows, colCount, headerData) # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hexKey = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True) start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) modelKey = rfView['drf_model']['_key'] h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=colCount+1, vpredict=1, expectedAuc=0.5, doAUC=False) h2b.browseJsonHistoryAsUrlLastMatch("RF")
def test_rf_params_rand2(self): csvPathname = 'space_shuttle_damage.csv' for trial in range(10): # params is mutable. This is default. params = { 'sample': 80, 'stat_type': 'ENTROPY', 'class_weights': 'yes=1000', 'ntree': 50, 'response_variable': 'damage', 'ignore': 'flight', 'ntree': 25, 'out_of_bag_error_estimate': 1, } print "params:", params colX = h2o_rf.pickRandRfParams(paramDict, params) print "params:", params kwargs = params.copy() timeoutSecs = 180 start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time()-start # just to get the list of per class errors (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True) print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" # why does this vary between 22 and 23 self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes' self.assertLess(classErrorPctList[0],95) # class 0 is 'no' self.assertLess(classErrorPctList[1],29) # class 1 is 'yes' self.assertLess(classification_error,61)
def test_rf_covtype_train_full_fvec(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = kwargs['ntrees'] * 60 start = time.time() print "Note train.csv is used for both train and validation" rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) job_key = rfv['job_key'] model_key = rfv['destination_key'] rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv) # hmm..just using defaults above in RF? self.assertLess(classification_error, 4.8, "train.csv should have full classification error: %s < 4.8" % classification_error) print "Trial #", trial, "completed"
def test_from_import_fvec(self): csvFilenameAll = [ ("covtype.data", 500), # ("covtype20x.data", 1000), ] for (csvFilename, timeoutSecs) in csvFilenameAll: # creates csvFilename.hex from file in importFolder dir hex_key = csvFilename + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True) h2o_cmd.infoFromInspect(inspect, parseResult['destination_key']) summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key']) # h2o_cmd.infoFromSummary(summaryResult) trees = 2 start = time.time() rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n"+l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_1ktrees_job_cancel_many_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print "kick off jobs, then cancel them" for trial in range (1,5): # random 0 or 1 delay delay = random.uniform(0,1) time.sleep(delay) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25) print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds' ### h2o_jobs.cancelAllJobs(timeoutSecs=10) h2o.check_sandbox_for_errors() # do one last good one rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_rf_params_rand2(self): csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv') for trial in range(10): # params is mutable. This is default. params = { 'sample': 80, 'stat_type': 'ENTROPY', 'class_weights': 'yes=1000', 'ntree': 50, 'parallel': 1, 'response_variable': 'damage', 'ignore': 'flight', 'ntree': 25, 'out_of_bag_error_estimate': 1, } print "params:", params colX = h2o_rf.pickRandRfParams(paramDict, params) print "params:", params kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10) start = time.time() rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time()-start # just to get the list of per class errors (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True) print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" # why does this vary between 22 and 23 self.assertAlmostEqual(totalScores,23,delta=1) # class 1 is 'yes' self.assertLess(classErrorPctList[0],95) # class 0 is 'no' self.assertLess(classErrorPctList[1],29) # class 1 is 'yes' self.assertLess(classification_error,61)
def test_rf_float_bigexp_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 1000 colCount = 7 write_syn_dataset(csvPathname, totalRows, colCount, headerData) for trial in range(5): # grow the data set rowData = rand_rowData(colCount) num = random.randint(4096, 10096) append_syn_dataset(csvPathname, colCount, num) totalRows += num # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True) start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) inspect = h2o_cmd.runInspect(key=hex_key) cols = inspect['cols'] numCols = inspect['numCols'] for i, c in enumerate(cols): if i < ( numCols - 1 ): # everything except the last col (output) should be 8 byte float colType = c['type'] self.assertEqual(colType, 'Real', msg="col %d should be type Real: %s" % (i, colType)) h2o.check_sandbox_for_errors()
def doBoth(): h2o.verboseprint("Trial", trial) start = time.time() # make sure ntrees and max_depth are the same for both rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed1 = time.time() - start (totalError1, classErrorPctList1, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response, timeoutSecs=600, retryDelaySecs=3) elapsed2 = time.time() - start (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView) print "Checking that results are similar (within 20%)" print "DRF2 then SpeeDRF" print "per-class variance is large..basically we can't check very well for this dataset" for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)): print "classErrorPctList[%s]:i %s %s" % (i, j, k) # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], # delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i) print "totalError: %s %s" % (totalError1, totalError2) self.assertAlmostEqual( totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF") print "elapsed: %s %s" % (elapsed1, elapsed2) self.assertAlmostEqual( elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
def test_rf_float_bigexp_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 1000 colCount = 7 write_syn_dataset(csvPathname, totalRows, colCount, headerData) for trial in range (5): # grow the data set rowData = rand_rowData(colCount) num = random.randint(4096, 10096) append_syn_dataset(csvPathname, colCount, num) totalRows += num # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = csvFilename + "_" + str(trial) + ".hex" # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ?? ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # cm0 = rfView['drf_model']['cms'][0]['_arr'] # print cm0 # self.assertEqual(len(cm0), numCols, # msg="%s cols in cm, means rf must have ignored some cols. 
I created data with %s cols" % (len(cm0), numCols-1)) inspect = h2o_cmd.runInspect(key=hex_key) cols = inspect['cols'] numCols = inspect['numCols'] for i,c in enumerate(cols): if i < (numCols-1): # everything except the last col (output) should be 8 byte float colType = c['type'] self.assertEqual(colType, 'Real', msg="col %d should be type Real: %s" % (i, colType)) ### h2o_cmd.runInspect(key=hex_key) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def test_rf_hhp_2a_fvec(self): h2o.beta_features = True csvFilenameList = { 'hhp.cut3.214.data.gz', } for csvFilename in csvFilenameList: csvPathname = csvFilename print "RF start on ", csvPathname dataKeyTrain = 'rTrain.hex' start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=dataKeyTrain, schema='put', timeoutSecs=120) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numCols = inspect['numCols'] # we want the last col. Should be values 0 to 14. 14 most rare # from the cut3 set # 84777 0 # 13392 1 # 6546 2 # 5716 3 # 4210 4 # 3168 5 # 2009 6 # 1744 7 # 1287 8 # 1150 9 # 1133 10 # 780 11 # 806 12 # 700 13 # 345 14 # 3488 15 execExpr = "%s[,%s] = %s[,%s]==14" % (dataKeyTrain, numCols, dataKeyTrain, numCols) h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10) inspect = h2o_cmd.runInspect(key=dataKeyTrain) h2o_cmd.infoFromInspect(inspect, "going into RF") execResult = {'destination_key': dataKeyTrain} kwargs = { 'ntrees': 20, 'max_depth': 20, 'nbins': 50, } rfView = h2o_cmd.runRF(parseResult=execResult, timeoutSecs=900, retryDelaySecs=10, **kwargs) print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds' (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
def test_rf_big1_nopoll(self): csvFilename = 'hhp_107_01.data.gz' csvPathname = h2o.find_file("smalldata/" + csvFilename) key2 = csvFilename + ".hex" print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=15) rfViewInitial = [] rfView = {} # dispatch multiple jobs back to back for jobDispatch in range(25): start = time.time() kwargs = {} model_key = 'RF_model' + str(jobDispatch) # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] h2o_cmd.runRFOnly(node=randomNode, parseKey=parseKey, model_key=model_key, timeoutSecs=300, noPoll=True, **kwargs) print "rfView:", h2o.dump_json(rfView) # FIX! are these already in there? rfView['data_key'] = key2 rfView['model_key'] = model_key rfViewInitial.append(rfView) print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for rfView in rfViewInitial: print "Checking completed job, with no polling:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True) h2o_rf.simpleCheckRFView(None, a)
def test_rf_airlines_2013_fvec(self): h2o.beta_features = True h2b.browseTheCloud() csvFilename = 'year2013.csv' hex_key = 'year2013.hex' importFolderPath = 'airlines' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False) parse_time = time.time() - start print "parse took {0} sec".format(parse_time) start = time.time() start = time.time() # noise=['JStack','cpu','disk']) h2o_cmd.runSummary(key=hex_key, timeoutSecs=200) elapsed = time.time() - start print "summary took {0} sec".format(elapsed) trees = 10 paramsTrainRF = { 'ntrees': trees, 'max_depth': 20, 'nbins': 200, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed', 'timeoutSecs': 14800, } kwargs = paramsTrainRF.copy() start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n" + l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_1ktrees_job_cancel_many_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() # always match the run below! # just using one file for now for x in [1000]: shCmdString = "perl " + h2o.find_file( "syn_scripts/parity.pl") + " 128 4 " + str( x) + " quad " + SYNDATASETS_DIR h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4) csvFilename = "parity_128_4_" + str(x) + "_quad.data" csvFilename = "parity_128_4_" + str(1000) + "_quad.data" csvPathname = SYNDATASETS_DIR + '/' + csvFilename hex_key = csvFilename + ".hex" parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print "kick off jobs, then cancel them" for trial in range(1, 50): # random 0 or 1 delay delay = random.uniform(0, 1) time.sleep(delay) h2o.verboseprint("Trial", trial) start = time.time() h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25) print "RF #", trial, "started on ", csvFilename, 'took', time.time( ) - start, 'seconds' ### h2o_jobs.cancelAllJobs(timeoutSecs=10) h2o.check_sandbox_for_errors() # do one last good one rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=False, timeoutSecs=600, retryDelaySecs=3) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_rf_float_bigexp_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_prostate.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 1000 colCount = 7 write_syn_dataset(csvPathname, totalRows, colCount, headerData) for trial in range (5): # grow the data set rowData = rand_rowData(colCount) num = random.randint(4096, 10096) append_syn_dataset(csvPathname, colCount, num) totalRows += num # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'response': 'AGE', 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True) start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=15, **kwargs) print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \ 'took', time.time() - start, 'seconds' rfView["drf_model"] = rfView.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, ntree=ntree) inspect = h2o_cmd.runInspect(key=hex_key) cols = inspect['cols'] #num_cols = inspect['num_cols'] #for i,c in enumerate(cols): # if i < (num_cols-1): # everything except the last col (output) should be 8 byte float # colType = c['type'] # self.assertEqual(colType, 'float', msg="col %d should be type Real: %s" % (i, colType)) h2o.check_sandbox_for_errors()
def test_rf_multinomial_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_multinomial.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 400 colCount = 7 for trial in range(5): write_syn_dataset(csvPathname, totalRows, colCount, headerData) # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hexKey = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True) start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) modelKey = rfView['drf_model']['_key'] h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=colCount + 1, vpredict=1, expectedAuc=0.5, doAUC=False) h2b.browseJsonHistoryAsUrlLastMatch("RF")
def test_rf_airlines_2013_fvec(self): h2o.beta_features = True h2b.browseTheCloud() csvFilename = 'year2013.csv' hex_key = 'year2013.hex' importFolderPath = 'airlines' csvPathname = importFolderPath + "/" + csvFilename start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False) parse_time = time.time() - start print "parse took {0} sec".format(parse_time) start = time.time() start = time.time() # noise=['JStack','cpu','disk']) h2o_cmd.runSummary(key=hex_key, timeoutSecs=200) elapsed = time.time() - start print "summary took {0} sec".format(elapsed) trees = 10 paramsTrainRF = { 'ntrees': trees, 'max_depth': 20, 'nbins': 200, 'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed', 'timeoutSecs': 14800, } kwargs = paramsTrainRF.copy() start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs) elapsed = time.time() - start (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \ trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, trees, classification_error, classErrorPctList, totalScores) print "\n"+l h2o.cloudPerfH2O.message(l) # just to make sure we test this h2i.delete_keys_at_all_nodes(pattern=hex_key)
def test_rf_params_rand2(self): csvPathname = 'space_shuttle_damage.csv' for trial in range(10): # params is mutable. This is default. params = { 'sample': 80, 'stat_type': 'ENTROPY', 'class_weights': 'yes=1000', 'ntree': 50, 'response_variable': 'damage', 'ignore': 'flight', 'ntree': 25, 'out_of_bag_error_estimate': 1, } print "params:", params colX = h2o_rf.pickRandRfParams(paramDict, params) print "params:", params kwargs = params.copy() timeoutSecs = 180 start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time() - start # just to get the list of per class errors (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noPrint=True) print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs), "\n" # why does this vary between 22 and 23 self.assertAlmostEqual(totalScores, 23, delta=1) # class 1 is 'yes' self.assertLess(classErrorPctList[0], 95) # class 0 is 'no' self.assertLess(classErrorPctList[1], 29) # class 1 is 'yes' self.assertLess(classification_error, 61)
def test_rf_params_rand2(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv') for trial in range(10): # params is mutable. This is default. params = { 'sample': 80, 'gini': 0, 'class_weights': 'yes=1000', 'ntree': 50, 'parallel': 1, 'response_variable': 'damage', 'ignore': 'flight', 'ntree': 25, 'out_of_bag_error_estimate': 1, } print "params:", params colX = h2o_rf.pickRandRfParams(paramDict, params) print "params:", params kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 6 or 10) start = time.time() rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time()-start # just to get the list of per class errors (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, noprint=True) print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" self.assertEqual(totalScores,23) # class 1 is 'yes' self.assertLess(classErrorPctList[0],82) # class 0 is 'no' self.assertLess(classErrorPctList[1],29) # class 1 is 'yes' self.assertLess(classification_error,61)
def test_rf_covtype_train_full(self): h2o.beta_features = True csvFilename = 'train.csv' csvPathname = 'bench/covtype/h2o/' + csvFilename parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=csvFilename + ".hex", header=1, timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertLess(classification_error, 0.02, "train.csv should have full classification error <0.02") print "Trial #", trial, "completed"
def test_rf_covtype_train_full_fvec(self):
    # Train DRF2 on covtype with noPoll=True, poll job stats manually, then
    # fetch the model view by key and check overall classification error.
    h2o.beta_features = True
    csvFilename = "covtype.data"
    csvPathname = "standard/" + csvFilename
    parseResult = h2i.import_parse(
        bucket="home-0xdiag-datasets", path=csvPathname, schema="put",
        hex_key=csvFilename + ".hex", timeoutSecs=180
    )
    for trial in range(1):
        # params is mutable. This is default.
        kwargs = paramDict
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = kwargs["ntrees"] * 60
        start = time.time()
        print "Note train.csv is used for both train and validation"
        # noPoll=True: the call returns immediately with job/destination keys
        rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
        # block here until the cluster reports no busy jobs
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100
        )
        job_key = rfv["job_key"]  # NOTE(review): assigned but unused below
        model_key = rfv["destination_key"]
        # fetch the completed model's view directly by key
        rfv = h2o_cmd.runRFView(
            data_key=parseResult["destination_key"], model_key=model_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=1
        )
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
        # hmm..just using defaults above in RF?
        self.assertLess(
            classification_error,
            4.8,
            "train.csv should have full classification error: %s < 4.8" % classification_error,
        )
        print "Trial #", trial, "completed"
def test_rf_covtype20x_fvec(self):
    # Train RF on covtype20x (or 1x when DO_SMALL), then score via RFView on a
    # separately-parsed copy, generate predictions on a third (uncached) key,
    # and finally build/print a confusion matrix from runPredict output.
    h2o.beta_features = True
    importFolderPath = 'standard'
    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain
    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest
    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    kwargs = {'str': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)
    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works
    # params is mutable. This is default.
    paramDict = drf2ParamDict
    params = {'ntrees': 20, 'destination_key': 'RF_model'}
    # randomly perturb params in place from paramDict's choices
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    timeoutSecs = 30 + kwargs['ntrees'] * 60
    start = time.time()
    rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntree = kwargs['ntrees']
    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree,
            timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # delta=50 around 50 accepts any error 0..100; this is a smoke check only
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey,
            destination_key=predictKey, timeoutSecs=timeoutSecs)
        # build a confusion matrix of actual C55 vs the 'predict' column
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C55',
            predict=predictKey,
            vpredict='predict',
        )
        cm = predictCMResult['cm']
        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_RF(self): h2o.beta_features = True paramsTrainRF = { 'seed': '1234567890', # if I use 100, and just one tree, I should get same results for sorted/shuffled? # i.e. the bagging always sees everything. Means oobe will be messed up # so will specify validation = the 10pct holdout data (could reuse the training data?) 'sample_rate': 1.0, 'ntrees': 3, 'max_depth': 300, 'nbins': 200, 'timeoutSecs': 600, 'response': 'C55', } paramsScoreRF = { 'vactual': 'C55', 'timeoutSecs': 600, } # 90% data trainKey1 = self.loadData(trainDS1) scoreKey1 = self.loadData(scoreDS1) kwargs = paramsTrainRF.copy() trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs) (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1) # self.assertEqual(4.29, classification_error1) # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1) # with new RNG 9/26/14 self.assertEqual(4.4, classification_error1) self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1) self.assertEqual(58101, totalScores1) kwargs = paramsScoreRF.copy() scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs) # 10% data trainKey2 = self.loadData(trainDS2) scoreKey2 = self.loadData(scoreDS2) kwargs = paramsTrainRF.copy() trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs) (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2) # self.assertEqual(4.29, classification_error2) # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2) # with new RNG 9/26/14 self.assertEqual(4.4, classification_error1) self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1) self.assertEqual(58101, totalScores2) kwargs = paramsScoreRF.copy() scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs) print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)" df = 
h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)" df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True) print "df.difference:", h2o.dump_json(df.difference) # should only be two diffs if len(df.difference) > 2: raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def test_RF_mnist(self):
    # Import MNIST, parse test then train, train classic RF1 on the train set,
    # score via RFView on the test set, and generate predictions.
    importFolderPath = "/home/0xdiag/datasets/mnist"
    csvFilelist = [
        # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600),
        ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
    ]
    # IMPORT**********************************************
    # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
    # result schema differs across h2o versions; accept either field name
    if 'files' in importFolderResult:
        succeededList = importFolderResult['files']
    else:
        succeededList = importFolderResult['succeeded']
    ### print "succeededList:", h2o.dump_json(succeededList)
    self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?")
    # why does this hang? can't look at storeview after import?
    print "\nTrying StoreView after the import folder"
    h2o_cmd.runStoreView(timeoutSecs=30)
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()
        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
            key2=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']
        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
            key2=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']
        # RF+RFView (train)****************************************
        print "This is the 'ignore=' we'll use"
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'],
            timeoutSecs=300, forRF=True)
        ntree = 10
        params = {
            'response_variable': 0,
            'ignore': ignore_x,
            'ntree': ntree,
            'iterative_cm': 1,
            'out_of_bag_error_estimate': 1,
            # 'data_key='mnist_training.csv.hex'
            'features': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'exclusive_split_limit': None,
            'depth': 2147483647,
            'stat_type': 'ENTROPY',
            'sampling_strategy': 'RANDOM',
            'sample': 67,
            # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
            'model_key': 'RF_model',
            'bin_limit': 1024,
            'seed': 784834182943470027,
            'parallel': 1,
            'use_non_local_data': 0,
            'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
        }
        kwargs = params.copy()
        timeoutSecs = 1800
        start = time.time()
        # NOTE(review): 'pollTimeoutsecs' is probably a typo for 'pollTimeoutSecs' — confirm
        # against h2o_cmd.runRFOnly's signature before changing.
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True, timeoutSecs=timeoutSecs,
            pollTimeoutsecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        h2o_rf.simpleCheckRFView(None, rfView, **params)
        modelKey = rfView['model_key']
        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, **params)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)
        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2,
            timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_enums_mappings_fvec(self):
    # Check RF behavior across enum mappings: build train/score datasets from
    # the same (reversed) enum list, train repeatedly (4 runs per dataset), and
    # accumulate error/enum history for manual comparison in the output.
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 3000
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        # (n, 4, 'cG', 300),
        # (n, 5, 'cH', 300),
        # (n, 6, 'cI', 300),
        (n, 3, 'cI', 300),
        (n, 3, 'cI', 300),
        (n, 3, 'cI', 300),
    ]
    # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
    SEED_FOR_TRAIN = 1234567890
    SEED_FOR_SCORE = 9876543210
    errorHistory = []
    enumHistory = []
    lastcolsTrainHistory = []
    lastcolsScoreHistory = []
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        enumList = create_enum_list(listSize=ENUMS)
        # reverse the list
        enumList.reverse()
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar
        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename
        # use same enum List
        enumListForScore = enumList
        print "Creating random", csvPathname, "for rf model building"
        lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)
        lastcolsTrainHistory.append(lastcols)
        print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
        # same enum list/mapping, but different dataset?
        lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
        lastcolsScoreHistory.append(lastcols)
        scoreDataKey = "score_" + hex_key
        # parse the score dataset first, then the training dataset
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']
        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
        y = colCount  # response is the last column
        modelKey = 'enums'
        # limit depth and number of trees to accentuate the issue with categorical split decisions
        if SPEEDRF:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'num_trees': 1,
                'max_depth': 100,
                'oobee': 1,
                'seed': 123456789,
            }
        else:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'validation': scoreDataKey,
                'seed': 123456789,
            }
        # repeat the train/score cycle to watch for run-to-run variation
        for r in range(4):
            start = time.time()
            if SPEEDRF:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=180, **kwargs)
            else:
                rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=180, **kwargs)
            print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            # print h2o.dump_json(rfResult)
            (classification_error, classErrorPctList, totalScores) = \
                h2o_rf.simpleCheckRFView(rfv=rfResult)
            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1,
                doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)
            errorHistory.append(classification_error)
            enumHistory.append(enumList)
        print "error from all runs on this dataset (with different enum mappings)"
        print errorHistory
        for e in enumHistory:
            print e
    print "last row from all train datasets, as integer"
    for l in lastcolsTrainHistory:
        print l
    print "last row from all score datasets, as integer"
    for l in lastcolsScoreHistory:
        print l
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    # Helper: parse a covtype file, make a random 90/10 train/test split, train
    # RF and compare OOBE pct-right to expected values, then score the holdout
    # and compare again. Returns the last training RFView response.
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    pct10 = int(numRows * .1)
    # row counts at each 10% step of the dataset
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]
    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]
    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]
    trial = 0
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)
        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        createTestTrain(hex_key, dataKeyTrain, dataKeyTest, percent=0.90, outputClass=4, numCols=numCols)
        # minimal dict standing in for a parse result, pointing at the train split
        sliceResult = {'destination_key': dataKeyTrain}
        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
        # oobeTrainPctRight = 100 * (1.0 - error)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)
        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        # pull keys/tree count out of the returned model for the scoring view
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_selfKey']
        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, used_trees, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfvScoring)
        fullScorePctRight = 100 - error
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
    # dump actual-vs-expected so the expected lists can be updated by hand
    actualDelta = [abs(a-b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
    actualDelta = [abs(a-b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
    # return the last rfv done during training
    return rfv
def test_RF_mnist_reals(self):
    # Same flow as the integer MNIST test but on the "reals" files, with 100
    # trees: import, parse test then train, train RF1, score via RFView, predict.
    importFolderPath = "/home/0xdiag/datasets/mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    # IMPORT**********************************************
    # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
    succeededList = importFolderResult['files']
    ### print "succeededList:", h2o.dump_json(succeededList)
    self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?")
    # why does this hang? can't look at storeview after import?
    print "\nTrying StoreView after the import folder"
    h2o_cmd.runStoreView(timeoutSecs=30)
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()
        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
            key2=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']
        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
            key2=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']
        # RF+RFView (train)****************************************
        print "This is the 'ignore=' we'll use"
        ignore_x = h2o_glm.goodXFromColumnInfo(
            y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True)
        ntree = 100
        params = {
            'response_variable': 0,
            'ignore': ignore_x,
            'ntree': ntree,
            'iterative_cm': 1,
            'out_of_bag_error_estimate': 1,
            # 'data_key='mnist_reals_training.csv.hex'
            'features': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'exclusive_split_limit': None,
            'depth': 2147483647,
            'stat_type': 'ENTROPY',
            'sampling_strategy': 'RANDOM',
            'sample': 67,
            # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
            'model_key': 'RF_model',
            'bin_limit': 1024,
            'seed': 784834182943470027,
            'parallel': 1,
            'use_non_local_data': 0,
            'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        # NOTE(review): 'pollTimeoutsecs' is probably a typo for 'pollTimeoutSecs' — confirm
        # against h2o_cmd.runRFOnly's signature before changing.
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs,
            pollTimeoutsecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        h2o_rf.simpleCheckRFView(None, rfView, **params)
        modelKey = rfView['model_key']
        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, **params)
        self.assertAlmostEqual(
            classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)
        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(
            model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult[ 'destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult[ 'destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount + 1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\ msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) self.assertTrue(classification_error <= 5.0, msg="Classification error %s too big" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 self.assertTrue( fullScorePctRight >= expectScorePctRight, msg="Full: pct. right for scoring not close enough %6.2f %6.2f" % (fullScorePctRight, expectScorePctRight), delta=5)
def test_export_import(self):
    """Parse covtype train/test splits, sweep one RF parameter (chosen by the
    module-global TRY: 'max_depth' | 'ntrees' | 'nbins'), train an fvec RF per
    value, export the parsed keys to local files, and sanity-check the model.

    Relies on module globals: TRY, paramDict, DO_OOBE, depthList (when
    TRY == 'max_depth'), plus the h2o/h2i/h2o_cmd/h2o_rf helpers.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    h2o.beta_features = True  # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    trainKey = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname,
        hex_key=trainKey, timeoutSecs=180, doSummary=False)
    inspect = h2o_cmd.runInspect(None, trainKey)

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    testKey = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname,
        hex_key=testKey, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, testKey)

    trial = 0
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList = [10, 100, 1000]

    # pick the sweep list for the parameter named by TRY
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        # write the swept value into the shared paramDict (mutates the module global)
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            # no validation frame -> out-of-bag error estimate
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']

        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        trial += 1
        kwargs = paramDict.copy()
        modelKey = 'RFModel_' + str(trial)
        kwargs['destination_key'] = modelKey

        start = time.time()
        rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, **kwargs)
        trainElapsed = time.time() - start
        print 'rf train end on', csvTrainPathname, 'took', trainElapsed, 'seconds'

        # exercise export of both parsed frames to the local syn dir (force overwrite)
        h2o.nodes[0].export_files(src_key=testKey, path=SYNDATASETS_DIR + "/" + testKey, force=1)
        h2o.nodes[0].export_files(src_key=trainKey, path=SYNDATASETS_DIR + "/" + trainKey, force=1)
        # h2o.nodes[0].export_files(src_key=modelKey, path=SYNDATASETS_DIR + "/" + modelKey, force=1)

        # pull fields out of the model for logging; cms/varimp/N are extracted
        # but only the tree stats and errs are actually printed below
        rf_model = rfResult['drf_model']
        cms = rf_model['cms']
        ### print "cm:", h2o.dump_json(cm)
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']
        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfResult)
        print "classErrorPctList:", classErrorPctList
        # covtype has 7 classes, so the per-class error list should have 7 entries
        self.assertEqual(len(classErrorPctList), 7,
            "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")

        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey)
def test_rfview_score(self): csvPathnameTrain = 'UCI/UCI-large/covtype/covtype.data' print "Train with:", csvPathnameTrain parseResultTrain = h2i.import_parse(bucket='datasets', path=csvPathnameTrain, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTrain = parseResultTrain['destination_key'] csvPathnameTest = 'UCI/UCI-large/covtype/covtype.data' print "Test with:", csvPathnameTest parseResultTest = h2i.import_parse(bucket='datasets', path=csvPathnameTest, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTest = parseResultTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5) rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key',None) data_key = rfv['data_key'] kwargs.pop('data_key',None) ntree = rfv['ntree'] kwargs.pop('ntree',None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # new web page for predict? 
throw it in here for now (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if 'sampling_strategy' in kwargs and kwargs['sampling_strategy'] != 'STRATIFIED_LOCAL': check_err = True else: check_err = False if check_err: self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 0 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! 
should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rf_predict3_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() timeoutSecs = 600 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1 == 1: y = 4 # last col response = 'response' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'iris2.csv.hex' # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0} # No translate because we're using an Exec to get the data out?, and that loses the encoding? translate = None # one wrong will be 0.66667. I guess with random, that can happen? expectedPctWrong = 0.7 elif 1 == 0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 elif 1 == 0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 elif 1 == 0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' translate = { '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7 } expectedPctWrong = 0.7 else: y = 0 # first col response = 'C1' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist_training.hex' translate = { \ '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, \ '5': 
5, '6': 6, '7': 7, '8': 8, '9': 9 } expectedPctWrong = 0.7 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey + "=" + hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time( ) - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col( csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col( csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no 
header on source if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)): raise Exception( "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o) != str(p): if wrong == 10: print "Not printing any more mismatches\n" elif wrong < 10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong) / len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 2.0: raise Exception( "pctWrong too high. Expect < 2% error because it's reusing training data" ) return pctWrong #***************************************************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) kwargs = { 'destination_key': 'rf_model', 'response': response, 'ntrees': trees, 'classification': 1, } rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) rfResult["drf_model"] = rfResult.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" print "y=", y pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) # can be zero if memorized (iris is either 0 or 0.667?) # just make delta 0.7 for now self.assertAlmostEqual( pctWrong, expectedPctWrong, delta=0.7, msg= "predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def test_RF_mnist_both(self): importFolderPath = "/home/0xdiag/datasets/mnist_repl" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, parsePattern, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 100 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual( classification_error, 0.028, delta=0.01, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_RF_mnist_both(self): importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntree = 25 params = { 'response_variable': 0, # 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'use_non_local_data': 1, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_mnist_both_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "Not using ignore from this..have to adjust cols?" h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntree = 2 params = { 'response': 'C1', # 'ignored_cols_by_name': ignore_x, 'ntrees': ntree, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': 'RF_model', 'nbins': 100, 'importance': 0, 'balance_classes': 0, } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print 'rfView:', h2o.dump_json(rfView) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['drf_model']['_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 
1 on oobe causes stack trace? kwargs = {'response': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # training and test data are unique, so error won't be low? # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = { 'min': rfView['drf_model']['treeStats']['minLeaves'], 'mean': rfView['drf_model']['treeStats']['meanLeaves'], 'max': rfView['drf_model']['treeStats']['maxLeaves'], } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': rfView['drf_model']['treeStats']['minDepth'], 'mean': rfView['drf_model']['treeStats']['meanDepth'], 'max': rfView['drf_model']['treeStats']['maxDepth'], } depthExpected = {'min': 20, 'mean': 20, 'max': 20} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. 
different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_RF_mnist_reals_fvec(self):
    """Parse mnist_reals train/test, train a 10-tree DRF with a fixed seed,
    score it on the test frame, assert the classification error is near 9,
    and finally run predictions on the test frame.
    """
    importFolderPath = "mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename,
            hex_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename,
            hex_key=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # RF+RFView (train)****************************************
        # columns to exclude from training, derived from column info
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True)
        ntrees = 10
        params = {
            'response': 'C1',
            'ignored_cols_by_name': ignore_x,
            'ntrees': ntrees,
            'mtries': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'max_depth': 15,
            'sample_rate': 0.67,
            'destination_key': 'RF_model',
            'nbins': 1024,
            'seed': 784834182943470027,  # fixed so the run is reproducible
            'importance': 0,
            'balance_classes': 0,
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        h2o_rf.simpleCheckRFView(None, rfv, **params)
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        # NOTE(review): 'response_variable' here vs 'response' in params above
        # — confirm which name the RFView endpoint expects.
        kwargs = {'response_variable': y}
        rfv = h2o_cmd.runRFView(data_key=testKey2, model_key=model_key, ntrees=ntrees,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfv, **params)
        self.assertAlmostEqual(classification_error, 9, delta=1.0, msg="Classification error %s differs too much" % classification_error)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local', hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) params = { 'response': 'C' + str(y), 'cols': None, 'ignored_cols_by_name': ignore_x, 'classification': 1, 'validation': None, 'ntrees': 10, 'max_depth': 20, 'min_rows': None, 'nbins': 1000, 'mtries': None, 'sample_rate': 0.66, 'seed': None, } rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow params['destination_key'] = 'RFModel_' + str('jobDispatch') kwargs = params.copy() timeoutSecs = 1200 start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs) elapsed = time.time() - start # print h2o.dump_json(rfResult) print "rf job dispatch end on ", trainCsvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # FIX! are these already in there? rfView = {} rfView['data_key'] = trainKey2 rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] rfViewInitial.append(rfView) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) # FIX! need to add the rfview and predict stuff # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertAlmostEqual( classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) # print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cms = rf_model['cms'] ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] # FIX! should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
def test_rf_enums_score_superset_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 3000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for rf model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, 
enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' ntrees = 5 kwargs = { 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': ntrees, 'validation': scoreDataKey, } start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees) predictKey = 'Predict.hex' h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
def test_rf_enums_mappings(self):
    """Repeatedly build single-tree models (SpeeDRF, GBM or DRF2 depending on
    the module-level SPEEDRF/GBM flags) on synthetic enum datasets that use
    the same seeds but freshly-created (and reversed) enum lists, then score
    each model and collect the per-run classification errors for comparison.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        # (n, 4, 'cG', 300),
        # (n, 5, 'cH', 300),
        # (n, 6, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
    ]

    # fixed seeds so the train/score datasets are reproducible across runs
    # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
    SEED_FOR_TRAIN = 1234567890
    SEED_FOR_SCORE = 9876543210

    errorHistory = []
    enumHistory = []
    lastcolsTrainHistory = []
    lastcolsScoreHistory = []

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        enumList = create_enum_list(listSize=ENUMS)
        # reverse the list
        enumList.reverse()

        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        # use same enum List
        enumListForScore = enumList

        print "Creating random", csvPathname, "for rf model building"
        lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)
        lastcolsTrainHistory.append(lastcols)

        print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
        # same enum list/mapping, but different dataset?
        lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
        lastcolsScoreHistory.append(lastcols)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
        y = colCount
        modelKey = 'enums'

        # limit depth and number of trees to accentuate the issue with categorical split decisions
        # use mtries so both look at all cols at every split? doesn't matter for speedrf
        # does speedrf try one more time? with 3 cols, mtries=2, so another try might
        # get a look at the missing col
        # does matter for drf2. does it "just stop"
        # trying mtries always looking at all columns or 1 col might be interesting
        if SPEEDRF:
            kwargs = {
                'sample_rate': 0.999,
                'destination_key': modelKey,
                'response': y,
                'ntrees': 1,
                'max_depth': 100,
                # 'oobee': 1,
                'validation': hex_key,
                # 'validation': scoreDataKey,
                'seed': 123456789,
                'mtries': COLS,
            }
        elif GBM:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'validation': scoreDataKey,
                'seed': 123456789,
                # 'learn_rate': .1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'classification': 1,
            }
        else:
            kwargs = {
                'sample_rate': 0.999,
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'validation': hex_key,
                # 'validation': scoreDataKey,
                'seed': 123456789,
                'nbins': 1024,
                'mtries': COLS,
            }

        # build and score the same model twice to check determinism
        for r in range(2):
            start = time.time()
            if GBM:
                gbmResult = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=180, **kwargs)
                print "gbm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(gbmResult)
                (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)
            elif SPEEDRF:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=180, **kwargs)
                print "speedrf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
            else:
                rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=180, **kwargs)
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y,
                vpredict=1, doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)
            errorHistory.append(classification_error)
            enumHistory.append(enumList)

    print "error from all runs on this dataset (with different enum mappings)"
    print errorHistory
    for e in enumHistory:
        print e
    print "last row from all train datasets, as integer"
    for l in lastcolsTrainHistory:
        print l
    print "last row from all score datasets, as integer"
    for l in lastcolsScoreHistory:
        print l
def test_speedrf_mnist(self):
    """Parse mnist train/test from smalldata, train a 10-tree SpeeDRF with a
    fixed seed, rename its result key so simpleCheckRFView can consume it,
    and print the resulting model stats.
    """
    importFolderPath = "mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("train.csv.gz", "test.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + testCsvFilename,
            hex_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 784  # last column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=importFolderPath + "/" + trainCsvFilename,
            hex_key=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds', \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # RF+RFView (train)****************************************
        # columns to exclude from training, derived from column info
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True)
        ntrees = 10
        params = {
            'response': y,
            'ignored_cols_by_name': ignore_x,
            'ntrees': ntrees,
            'mtries': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'max_depth': 15,
            'sample_rate': 0.67,
            'destination_key': 'SpeeDRF_model',
            'nbins': 1024,
            'seed': 784834182943470027,  # fixed so the run is reproducible
            'oobee': 1,
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfv = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # simpleCheckRFView expects the model under 'drf_model'; SpeeDRF
        # returns it under 'speedrf_model', so rename the key first.
        rfv["drf_model"] = rfv.pop("speedrf_model")
        h2o_rf.simpleCheckRFView(None, rfv, **params)
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']
        print "Total trees: ", used_trees
        print "On data key: ", data_key
        print "Produced model key: ", model_key
def test_rf_change_data_key_fvec(self): importFolderPath = 'standard' csvFilenameTrain = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseResult = parseResult # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
params = {'ntrees': 2, 'destination_key': 'RF_model'} # colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() kwargs["response"] = "C55" # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 100 start = time.time() h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntrees = kwargs['ntrees'] start = time.time() h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' for trial in range(3): # scoring start = time.time() rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' rfView["drf_model"] = rfView.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) # FIX! should update this expected classification error # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' print "Trial #", trial, "completed"
def test_RF_mnist_both(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ntree = 25 params = { 'response': 0, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 2147483647, 'select_stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample_rate': 0.67, 'oobee': 1, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'destination_key': 'RF_model', 'nbins': 1024, # 'seed': 784834182943470027, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # RFView (score on test)**************************************** (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # was 2.84 # sometimes get 2.87? self.assertAlmostEqual( classification_error, 1.6, delta=1.6, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['speedrf_model']['treeStats'] leaves = { 'min': treeStats['minLeaves'], 'mean': treeStats['meanLeaves'], 'max': treeStats['maxLeaves'] } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': treeStats['minDepth'], 'mean': treeStats['meanDepth'], 'max': treeStats['maxDepth'] } depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() modelKey = rfView['speedrf_model']['_key'] predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def run_rf(files,configs): overallWallStart = time.time() output = None #if not os.path.exists('rfbench.csv'): # output = open('rfbench.csv','w') # output.write(','.join(csv_header)+'\n') #else: # output = open('rfbench.csv','a') #csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, # dialect='excel', extrasaction='ignore',delimiter=',') #csvWrt.writeheader() try: java_heap_GB = h2o.nodes[0].java_heap_GB #Train File Parsing# trainParseWallStart = time.time() print "Training file is: ", files['train'] importFolderPath = "mnist/mnist8m" csvPathname = importFolderPath + "/" + files['train'] hex_key = files['train'] + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=120) trainParseWallTime = time.time() - trainParseWallStart #End Train File Parse# inspect = h2o.nodes[0].inspect(parseResult['destination_key']) row = {'java_heap_GB':java_heap_GB,'dataset':'mnist8m', 'nTrainRows': inspect['numRows'],'nCols':inspect['numCols'], #'nIgnoredCols':nIgnoredCols,'ignoredCols':ignoredCols, 'trainParseWallTime':trainParseWallTime} #RF+RFView (train)# kwargs = configs.copy() trainRFStart = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult,rfView=True, timeoutSecs= 3600,pollTimeoutSecs= 60,retryDelaySecs = 2, **kwargs) trainViewTime = time.time() - trainRFStart #End RF+RFView (train)# row.update({'trainViewTime':trainViewTime}) h2o_rf.simpleCheckRFView(None, rfView, **kwargs) modelKey = rfView['model_key'] #Test File Parsing# testParseWallStart = time.time() print "Testing file is: ", files['test'] importFolderPath = "mnist/mnist8m" csvPathname = importFolderPath + "/" + files['test'] hex_key = files['test'] + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=120) testParseWallTime = time.time() - testParseWallStart #End 
Test File Parse# inspect = h2o.nodes[0].inspect(parseResult['destination_key']) row.update({'nTestRows':inspect['numRows']}) row.update({'testParseWallTime':testParseWallTime}) modelKey = rfView['model_key'] #RFView (score on test)# kwargs = configs.copy() testRFStart = time.time() kwargs.update({'model_key':modelKey,'ntree':10}) rfView = h2o_cmd.runRFView(data_key=hex_key,timeoutSecs=180, doSimpleCheck=False,**kwargs) testViewTime = time.time() - testRFStart #End RFView (score on test)# pprint(rfView) errRate = rfView['confusion_matrix']['classification_error'] row.update({'testViewTime':testViewTime}) overallWallTime = time.time() - overallWallStart row.update({'overallWallTime':overallWallTime}) row.update({'errRate':errRate}) print row #csvWrt.writerow(row) #h2o.nodes[0].remove_key(k) finally: output.close()
def test_rf_covtype20x_fvec(self):
    """Train RF on covtype20x (or covtype if DO_SMALL), then score and
    predict against a separately-parsed copy of the same data.

    Exercises both the v2 (beta_features) and v1 parameter spellings,
    though beta_features is forced True at the top.
    """
    h2o.beta_features = True
    importFolderPath = 'standard'
    # DO_SMALL switches to the 1x covtype dataset for quicker runs
    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    # v2 Exec takes 'str'; v1 takes 'expression'
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    if h2o.beta_features:
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
    else:
        kwargs = {'expression': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    # v2 and v1 spell the tree-count and model-key params differently
    if h2o.beta_features:
        paramDict = drf2ParamDict
        params = {
            'ntrees': 20,
            'destination_key': 'RF_model'
        }
    else:
        paramDict = drf1ParamDict
        params = {
            'ntree': 20,
            'out_of_bag_error_estimate': 1,
            'model_key': 'RF_model'
        }

    # randomize the remaining RF params in-place (params is mutated)
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    # scale the timeout with the number of trees; ec2 can be really slow
    if h2o.beta_features:
        timeoutSecs = 30 + kwargs['ntrees'] * 60
    else:
        timeoutSecs = 30 + kwargs['ntree'] * 60

    start = time.time()
    rf = h2o_cmd.runRF(parseResult=parseResultTrain,
        timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    if h2o.beta_features:
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']
    else:
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']

    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree,
            timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # wide delta: just a sanity bound, not a regression check
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        # predict on the uncached 3rd copy of the test data
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(
            data_key=parseKey,
            model_key=rfModelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)

        # build a confusion matrix from actual (C54) vs predicted columns
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C54',
            predict=predictKey,
            vpredict='predict',
            )

        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)

        print "Trial #", trial, "completed"
def test_rf_predict3_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() timeoutSecs = 600 predictHexKey = 'predict_0.hex' predictCsv = 'predict_0.csv' actualCsv = 'actual_0.csv' if 1==1: y = 4 # last col response = 'response' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 bucket = 'smalldata' csvPathname = 'iris/iris2.csv' hexKey = 'iris2.csv.hex' # translate = {'setosa': 0.0, 'versicolor': 1.0, 'virginica': 2.0} # No translate because we're using an Exec to get the data out?, and that loses the encoding? translate = None # one wrong will be 0.66667. I guess with random, that can happen? expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 40 # try smaller data set compared to covtype bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.shuffled.10pct.data' hexKey = 'covtype.shuffled.10pct.data.hex' # translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0} translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 elif 1==0: y = 54 # last col response = 'C55' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'covtype.data.hex' translate = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7} expectedPctWrong = 0.7 else: y = 0 # first col response = 'C1' skipSrcOutputHeader = 1 skipPredictHeader = 1 trees = 6 bucket = 'home-0xdiag-datasets' csvPathname = 'mnist/mnist_training.csv.gz' hexKey = 'mnist_training.hex' translate = { \ '0': 0, '1': 1, '2': 2, '3': 3, '4': 
4, \ '5': 5, '6': 6, '7': 7, '8': 8, '9': 9 } expectedPctWrong = 0.7 csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv # for using below in csv reader csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True) def predict_and_compare_csvs(model_key, hex_key, translate=None, y=0): # have to slice out col 0 (the output) and feed result to predict # cols are 0:784 (1 output plus 784 input features # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30) dataKey = "P.hex" h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting if skipSrcOutputHeader: print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer" print "hack for now, can't chop out col 0 in Exec currently" dataKey = hex_key else: print "No header in dataset, can't chop out cols, since col numbers are used for names" dataKey = hex_key # +1 col index because R-like h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=hexKey, destination_key=predictHexKey) print "generate_predictions end on ", hexKey, " took", time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(key=predictHexKey) h2o_cmd.infoFromInspect(inspect, 'predict.hex') h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname) h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname) h2o.check_sandbox_for_errors() print "Do a check of the original output col against predicted output" (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname, msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader) (rowNum2, predictOutput) = compare_csv_at_one_col(csvPredictPathname, msg="Predicted", colIndex=0, skipHeader=skipPredictHeader) # no header on 
source if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)): raise Exception("original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \ %s" % (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader)) wrong = 0 for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)): # if float(o)!=float(p): if str(o)!=str(p): if wrong==10: print "Not printing any more mismatches\n" elif wrong<10: msg = "Comparing original output col vs predicted. row %s differs. \ original: %s predicted: %s" % (rowNum, o, p) print msg wrong += 1 print "\nTotal wrong:", wrong print "Total:", len(originalOutput) pctWrong = (100.0 * wrong)/len(originalOutput) print "wrong/Total * 100 ", pctWrong # I looked at what h2o can do for modelling with binomial and it should get better than 25% error? if pctWrong > 2.0: raise Exception("pctWrong too high. Expect < 2% error because it's reusing training data") return pctWrong #***************************************************************************** parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) kwargs = { 'destination_key': 'rf_model', 'response': response, 'ntrees': trees, 'classification': 1, } rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) rfResult["drf_model"] = rfResult.pop("speedrf_model") (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) print "Use H2O GeneratePredictionsPage with a H2O generated model and the same data key." print "Does this work? 
(feeding in same data key)if you're predicting, " print "don't you need one less column (the last is output?)" print "WARNING: max_iter set to 8 for benchmark comparisons" print "y=", y pctWrong = predict_and_compare_csvs(model_key='rf_model', hex_key=hexKey, translate=translate, y=y) # we are predicting using training data...so error is really low # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2, # msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error)) # can be zero if memorized (iris is either 0 or 0.667?) # just make delta 0.7 for now self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 0.7, msg="predicted pctWrong: %s should be small because we're predicting with training data" % pctWrong)
def test_rfview_score(self): csvPathnameTrain = 'standard/covtype.data' print "Train with:", csvPathnameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathnameTrain, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTrain = parseResultTrain['destination_key'] csvPathnameTest = 'standard/covtype.data' print "Test with:", csvPathnameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathnameTest, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTest = parseResultTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, **kwargs) # new web page for predict? 
throw it in here for now (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if 'sampling_strategy' in kwargs and kwargs[ 'sampling_strategy'] != 'STRATIFIED_LOCAL': check_err = True else: check_err = False if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 0 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! 
should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rf_covtype_fvec(self):
    """Sweep one RF hyperparameter (selected by module-level TRY: max_depth,
    ntrees, or nbins) over covtype train/test splits, repeatedly dispatching
    and cancelling RF jobs to probe job-id handling, then collect per-class
    error and train time for optional plotting (DO_PLOT).
    """
    h2o.beta_features = True # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname,
        hex_key=hex_key, timeoutSecs=180, doSummary=False)
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    rfViewInitial = []
    # xList/eList/fList accumulate plot coordinates across all trials
    xList = []
    eList = []
    fList = []
    trial = 0

    depthList = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList = [10, 100, 1000]

    # module-level TRY picks which hyperparameter this run sweeps
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        # write the swept value into the shared (module-level) paramDict
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']

        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 5
        for i in range(TRIES):
            # the first TRIES-1 dispatches are deliberately cancelled below;
            # only the last one runs to completion and gets checked
            lastOne = i == (TRIES - 1)

            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            model_key = 'RFModel_' + str(trial)
            kwargs['destination_key'] = model_key
            data_key = parseTrainResult['destination_key']

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
                noPoll=True, rfView=False, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #     raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
            # if 'drf_model' not in rfResult:
            #     raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runRFView(None, model_key=model_key,
                timeoutSecs=60, retryDelaySecs=5, doSimpleCheck=False)
            print "rfView:", h2o.dump_json(rfView)

            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            print "classErrorPctList:", classErrorPctList
            self.assertEqual(len(classErrorPctList), 7,
                "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")

            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

            # class 4 per-class error and elapsed train time are the plotted y-values
            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult['destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult['destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount+1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\ msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) self.assertTrue(classification_error<=5.0, msg="Classification error %s too big" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 self.assertTrue(fullScorePctRight >= expectScorePctRight, msg="Full: pct. right for scoring not close enough %6.2f %6.2f"% (fullScorePctRight, expectScorePctRight), delta=5)