def notest_RF_poker100(self):
    h2o.beta_features = True
    trees = 6
    timeoutSecs = 20
    csvPathname = 'poker/poker100'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)

def test_RFhhp(self):
    # NAs cause CM to zero..don't run for now
    csvPathname = 'hhp_9_17_12.predict.data.gz'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=30)

def test_rf_float_rand2_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 10000
    write_syn_dataset(csvPathname, totalRows, headerData)

    for trial in range(5):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, num)
        totalRows += num
        start = time.time()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        kwargs = {'ntrees': 5, 'max_depth': 5}
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'
        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

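# The write_syn_dataset / append_syn_dataset / rand_rowData helpers used above are
# defined at module level in the actual test file. A minimal sketch of the shape this
# test expects (the 9-column layout comes from headerData; the random value ranges
# and exact row format are assumptions, not the original implementation):
def rand_rowData():
    # one synthetic row matching the 9-column prostate-style header (assumed values)
    return ','.join(['%d' % random.randint(0, 9) for _ in range(9)])

def write_syn_dataset(csvPathname, rowCount, headerData):
    # write header plus rowCount synthetic rows
    with open(csvPathname, 'w') as dsf:
        dsf.write(headerData + "\n")
        for _ in range(rowCount):
            dsf.write(rand_rowData() + "\n")

def append_syn_dataset(csvPathname, num):
    # append num more synthetic rows to the existing file
    with open(csvPathname, 'a') as dsf:
        for _ in range(num):
            dsf.write(rand_rowData() + "\n")
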
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # two row dataset gets this. Avoiding it for now
    # java.lang.ArrayIndexOutOfBoundsException: 1
    # at hex.rf.Data.sample_fair(Data.java:149)
    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"

    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # algorithm for creating the path and filename is hardwired in parity.pl.
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2

def test_B_randomdata2_1_lineend(self):
    csvPathname = 'datagen1.csv'
    # change lineend, case 1
    csvPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
    print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending"
    csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'

    infile = open(csvPathname1, 'r')
    outfile = open(csvPathname2, 'w')  # existing file gets erased
    # assume all the test files are unix lineend.
    # I guess there shouldn't be any "in-between" ones
    # okay if they change I guess.
    for line in infile.readlines():
        outfile.write(line.strip("\n") + "\r")
    infile.close()
    outfile.close()

    parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=10,
        header=0, separator=44)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    numCols = inspect['numCols']
    h2o_cmd.runRF(parseResult=parseResult, trees=1, response='C' + str(numCols), timeoutSecs=20)

def test_B_randomdata2_1_lineend(self):
    csvPathname = 'datagen1.csv'
    # change lineend, case 1
    csvPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
    print "Using datagen1.csv to create", SYNDATASETS_DIR, "/datagen1.csv with different line ending"
    csvPathname2 = SYNDATASETS_DIR + '/datagen1_crlf.csv'

    infile = open(csvPathname1, 'r')
    outfile = open(csvPathname2, 'w')  # existing file gets erased
    # assume all the test files are unix lineend.
    # I guess there shouldn't be any "in-between" ones
    # okay if they change I guess.
    for line in infile.readlines():
        outfile.write(line.strip("\n") + "\r")
    infile.close()
    outfile.close()

    parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=10,
        header=0, separator=44)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    numCols = inspect['numCols']
    h2o_cmd.runRF(parseResult=parseResult, trees=1, response_variable='C' + str(numCols), timeoutSecs=20)

def test_rf_params_rand2_fvec(self):
    h2o.beta_features = True
    csvPathname = "standard/covtype.data"
    hex_key = "covtype.data.hex"
    for trial in range(10):
        # params is mutable. This is default.
        params = {"ntrees": 13, "mtries": 7}
        colX = h2o_rf.pickRandRfParams(paramDict, params)

        if "cols" in params and params["cols"]:
            pass
        else:
            if "ignored_cols_by_name" in params and params["ignored_cols_by_name"]:
                params["mtries"] = random.randint(1, 53)
            else:
                params["mtries"] = random.randint(1, 54)

        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + ((kwargs["ntrees"] * 80) * max(1, kwargs["mtries"] / 60))
        start = time.time()
        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname,
            schema="put", hex_key=hex_key)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        elapsed = time.time() - start
        print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)

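# paramDict, passed to h2o_rf.pickRandRfParams above, is a module-level dict mapping
# each RF parameter name to a list of candidate values, from which one is picked at
# random per trial. A minimal sketch, assuming fvec-era parameter names; the exact
# candidate lists here are illustrative assumptions, not the original dict:
paramDict = {
    'ntrees': [1, 5, 13, 30],
    'mtries': [None, 7, 20],
    'max_depth': [None, 5, 20, 50],
    'sample_rate': [None, 0.5, 0.67, 0.9],
    'seed': [None, 12345678],
}
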
def notest_B_RF_iris2(self):
    csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
    h2o_cmd.runRF(trees=6, model_key="iris2", timeoutSecs=10, retryDelaySecs=1,
        csvPathname=csvPathname)

def test_rf_strata_fail(self):
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    timeoutSecs = 60
    kwargs = {
        'response_variable': 54,
        'ntree': 50,
        'features': '',
        'depth': 2147483647,
        'stat_type': 'ENTROPY',
        'ignore': '',
        'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
        'sampling_strategy': 'RANDOM',
        'strata_samples': 'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
        'sample': '67',
        'out_of_bag_error_estimate': 1,
        'model_key': '',
        'bin_limit': 1024,
        'seed': 784834182943470027,
        'parallel': 1,
        'exclusive_split_limit': '',
        'iterative_cm': 1,
        'use_non_local_data': 0,
    }
    h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)

def test_hex_443(self):
    h2o.beta_features = True
    csvPathname = 'hex-443.parsetmp_1_0_0_0.data'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, ntrees=1, timeoutSecs=5)

def test_GenParity1(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # two row dataset gets this. Avoiding it for now
    # java.lang.ArrayIndexOutOfBoundsException: 1
    # at hex.rf.Data.sample_fair(Data.java:149)
    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"

    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # algorithm for creating the path and filename is hardwired in parity.pl.
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2

def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # FIX! TBD do we always have to kick off the run from node 0?
        # what if we do another node?
        # FIX! do we need or want a random delay here?
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
        sys.stdout.write('.')
        sys.stdout.flush()

def test_A_randomdata2(self):
    print "Using smalldata/datagen1.csv as is"
    csvPathname = h2o.find_file('smalldata/datagen1.csv')
    h2o_cmd.runRF(trees=1, response_variable=2, timeoutSecs=10, csvPathname=csvPathname)

def test_A_randomdata2(self):
    print "Using datagen1.csv as-is"
    csvPathname = 'datagen1.csv'
    # have to give the separator == comma...otherwise H2O can't deduce it on this dataset
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
        timeoutSecs=10, header=1, separator=44)
    h2o_cmd.runRF(parseResult=parseResult, trees=1, response_variable=2, timeoutSecs=20)

def tryThemAll(self, set, rows):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        for tokenCase in range(len(self.tokenChangeDict)):
            newRows1 = self.changeTokens(rows, tokenCase)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                if "'" in self.tokenChangeDict[tokenCase][0]:
                    single_quotes = 1
                else:
                    single_quotes = 0
                parseResult = h2i.import_parse(path=csvPathname, schema='put',
                    single_quotes=single_quotes, noPrint=not h2o.verbose)
                if DO_RF:
                    h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=30, retryDelaySecs=0.1)
    h2o.verboseprint("Set", set)
    sys.stdout.write('.')
    sys.stdout.flush()

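# eolDict, tokenChangeDict, and sepChangeDict used by tryThemAll are attributes defined
# elsewhere on the test class. A minimal sketch of the shape this code expects: indexable
# by the 0..len-1 loop counters, with tokenChangeDict entries themselves indexable (the
# code checks tokenChangeDict[tokenCase][0] for a single-quote). The contents below are
# illustrative assumptions, not the original tables:
eolDict = {0: "\n", 1: "\r\n", 2: "\r"}
tokenChangeDict = {0: ['abc', 'def'], 1: ["'quoted'", "'tokens'"]}
sepChangeDict = {0: ",", 1: ";", 2: "\t"}
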
def test_rf_strata_fail(self):
    csvPathname = 'UCI/UCI-large/covtype/covtype.data'
    timeoutSecs = 60
    kwargs = {
        'response_variable': 54,
        'ntree': 50,
        'features': '',
        'depth': 2147483647,
        'stat_type': 'ENTROPY',
        'ignore': '',
        'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
        'sampling_strategy': 'RANDOM',
        'strata_samples': 'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
        'sample': '67',
        'out_of_bag_error_estimate': 1,
        'model_key': '',
        'bin_limit': 1024,
        'seed': 784834182943470027,
        'parallel': 1,
        'exclusive_split_limit': '',
        'iterative_cm': 1,
        'use_non_local_data': 0,
    }
    parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

def test_A_c1_fvec(self):
    h2o.beta_features = True
    parseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', schema='put', timeoutSecs=60)
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=60)

def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10

def test_poker_xlsx(self):
    # maybe can get stuck during polling for parse progress?
    # break it out for pollTimeoutSecs
    parseResult = h2i.import_parse(bucket="datasets", path="poker/poker-hand-testing.xlsx",
        schema="put", timeoutSecs=120, pollTimeoutSecs=60)
    h2o_cmd.runRF(None, parseResult=parseResult, timeoutSecs=120)

def test_rf_big_rand_tree_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    rowCount = 5000
    colCount = 1000
    write_syn_dataset(csvPathname, rowCount, colCount)

    for trial in range(1):
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = csvFilename + "_" + str(trial)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        seed = random.randint(0, sys.maxint)
        # some cols can be dropped due to constant 0 or 1. make sure data set has
        # all 0's and all 1's above to guarantee no dropped cols!
        # kwargs = {'ntree': 3, 'depth': 50, 'seed': seed}
        # out of memory/GC errors with the above. reduce depth
        kwargs = {'ntrees': 3, 'max_depth': 20, 'seed': seed}
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=90)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=600, pollTimeoutSecs=180, **kwargs)
        print "trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(key=hex_key)
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        cols = inspect['cols']
        numCols = inspect['numCols']
        for i, c in enumerate(cols):
            colType = c['type']
            self.assertEqual(colType, 'Int', msg="col %d should be type Int: %s" % (i, colType))
        h2o.check_sandbox_for_errors()

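# This test's write_syn_dataset takes rowCount and colCount rather than header data.
# A minimal sketch, assuming random 0/1 integer cells; the comment in the test implies
# the generator forces all-0 and all-1 rows so no column is constant and gets dropped.
# The exact generation scheme is an assumption:
def write_syn_dataset(csvPathname, rowCount, colCount):
    with open(csvPathname, 'w') as dsf:
        dsf.write(','.join(['0'] * colCount) + "\n")  # force an all-0 row
        dsf.write(','.join(['1'] * colCount) + "\n")  # force an all-1 row
        for _ in range(rowCount - 2):
            rowData = [str(random.randint(0, 1)) for _ in range(colCount)]
            dsf.write(','.join(rowData) + "\n")
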
def test_D_GenParity1(self):
    trees = 50
    h2o_cmd.runRF(trees=trees, timeoutSecs=15,
        csvPathname=h2o.find_file('smalldata/parity_128_4_100_quad.data'),
        noise=('StoreView', None))

def test_rf_params_rand2(self):
    # for determinism, I guess we should spit out the seed?
    # random.seed(SEED)
    SEED = random.randint(0, sys.maxint)
    # if you have to force to redo a test
    # SEED = random.seed(SEED)
    print "\nUsing random seed:", SEED

    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) *
            (kwargs['parallel'] and 1 or 3))
        start = time.time()
        h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs)
        elapsed = time.time() - start
        print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)

def test_C_RF_poker100(self):
    parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 60
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10

def tryThemAll(self, set, rows, enumsOnly=False):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        if enumsOnly:
            tcd = self.tokenChangeDict
        else:
            tcd = self.tokenChangeDictEnumsOnly
        for tokenCase in range(len(tcd)):
            newRows1 = self.changeTokens(rows, tokenCase, tcd)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                if "'" in tcd[tokenCase][0]:
                    singleQuotes = 1
                else:
                    singleQuotes = 0
                parseResult = h2i.import_parse(path=csvPathname, schema='local',
                    singleQuotes=singleQuotes, noPrint=not h2o_args.verbose, retryDelaySecs=0.1,
                    doSummary=DO_SUMMARY, intermediateResults=DO_INTERMEDIATE_RESULTS)
                if DO_RF:
                    h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10,
                        retryDelaySecs=0.1, noPrint=True, print_params=True)
    verboseprint("Set", set)
    h2o.check_sandbox_for_errors()
    sys.stdout.write('.')
    sys.stdout.flush()

def test_badchars(self):
    print "badchars.csv has some 0x0 (<NUL>) characters."
    print "They were created by a dd that filled out to buffer boundary with <NUL>"
    print "They are visible using vim/vi"
    csvPathname = h2o.find_file('smalldata/badchars.csv')
    h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)

def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 3):
        sys.stdout.write(".")
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema="put",
            hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)

def tryThemAll(self, set, rows):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        for tokenCase in range(len(self.tokenChangeDict)):
            newRows1 = self.changeTokens(rows, tokenCase)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                parseResult = h2i.import_parse(path=csvPathname, schema='local', noPrint=not h2o.verbose)
                inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                print "\n" + csvPathname, \
                    " num_rows:", "{:,}".format(inspect['num_rows']), \
                    " num_cols:", "{:,}".format(inspect['num_cols'])
                num_rows = inspect['num_rows']
                num_cols = inspect['num_cols']
                self.assertEqual(num_cols, 4, "Parsed wrong number of cols: %s" % num_cols)
                self.assertEqual(num_rows, 29, "Parsed wrong number of rows: %s" % num_rows)
                h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10,
                    retryDelaySecs=1.0, noPrint=True)
    h2o.verboseprint("Set", set)
    h2o.check_sandbox_for_errors()
    sys.stdout.write('.')
    sys.stdout.flush()

def test_rf_1ktrees_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [500]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        # was str(1000) here, but the gen above only creates the 500-row file
        csvFilename = "parity_128_4_" + str(500) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)

def tryThemAll(self, set, rows, enumsOnly=False):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        if enumsOnly:
            tcd = self.tokenChangeDict
        else:
            tcd = self.tokenChangeDictEnumsOnly
        for tokenCase in range(len(tcd)):
            newRows1 = self.changeTokens(rows, tokenCase, tcd)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', noPrint=not h2o.verbose)
                h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=10,
                    retryDelaySecs=0.1, noPrint=True, print_params=False)
    h2o.verboseprint("Set", set)
    h2o.check_sandbox_for_errors()
    sys.stdout.write('.')
    sys.stdout.flush()

def tryThemAll(self, set, rows):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        for tokenCase in range(len(self.tokenChangeDict)):
            newRows1 = self.changeTokens(rows, tokenCase)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                if "'" in self.tokenChangeDict[tokenCase]:
                    single_quotes = 1
                else:
                    single_quotes = 0
                parseResult = h2i.import_parse(path=csvPathname, schema='put',
                    single_quotes=single_quotes, noPrint=not h2o.verbose)
                h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=30, retryDelaySecs=0.1)
    h2o.verboseprint("Set", set)
    sys.stdout.write('.')
    sys.stdout.flush()

def test_1ktrees_job_cancel_many(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=30)

    print "Kick off twenty, then cancel them all..there's a timeout on the wait after cancelling"
    for trial in range(1, 20):
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()

    h2o_jobs.cancelAllJobs(timeoutSecs=10)

def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in xrange(1, 10, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1

def test_cs_test(self):
    parseResult = h2i.import_parse(bucket='smalldata', path='kaggle/creditsample-training.csv.gz', schema='put')
    h2o_cmd.runRF(parseResult=parseResult, ntrees=5, max_depth=100, timeoutSecs=500,
        response='SeriousDlqin2yrs')
    # h2b.browseJsonHistoryAsUrlLastMatch("RFView")
    time.sleep(5)

def test_rf_big1_nopoll_fvec(self):
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=30, schema='put')
    rfViewInitial = []

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)
        kwargs['ntrees'] = 1
        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            kwargs['ntrees'] += 1
        kwargs['seed'] = random.randint(0, sys.maxint)

        # FIX! what model keys do these get?
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        rfView = h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)
        # save the initial response, so the check loop below has something to iterate
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['_dataKey']
        model_key = rfView['_key']
        ntree = rfView['ntree']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
        if first is None:
            # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(50, 200, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        # bump this up too if you do?

    # always match the gen above!
    ### for x in xrange (50,200,10):
    for x in xrange(50, 200, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + "100" + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        h2o_cmd.runRF(csvPathname=csvPathname, trees=100, timeoutSecs=5, retryDelaySecs=0.1)

def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10

def test_RFhhp(self):
    csvPathnamegz = h2o.find_file('smalldata/hhp_107_01.data.gz')
    print "\nRF start on ", csvPathnamegz, "this will probably take a minute.."
    start = time.time()
    h2o_cmd.runRF(csvPathname=csvPathnamegz, trees=23, timeoutSecs=120, retryDelaySecs=10)
    print "RF end on ", csvPathnamegz, 'took', time.time() - start, 'seconds'

def tryThemAll(self, set, rows):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        for tokenCase in range(len(self.tokenChangeDict)):
            newRows1 = self.changeTokens(rows, tokenCase)
            for sepCase in range(len(self.sepChangeDict)):
                newRows2 = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                h2o_cmd.runRF(trees=1, csvPathname=csvPathname, timeoutSecs=10,
                    retryDelaySecs=0.1, noPrint=True, print_params=False)
    h2o.verboseprint("Set", set)
    h2o.check_sandbox_for_errors()
    sys.stdout.write('.')
    sys.stdout.flush()

def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduce to get intermittent failures to lessen, for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # FIX! TBD do we always have to kick off the run from node 0?
        # what if we do another node?
        # FIX! do we need or want a random delay here?
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
        sys.stdout.write('.')
        sys.stdout.flush()

def test_1ktrees_job_cancel_many_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put',
        hex_key=hex_key, timeoutSecs=30)

    print "kick off jobs, then cancel them"
    for trial in range(1, 5):
        # random 0 or 1 delay
        delay = random.uniform(0, 1)
        time.sleep(delay)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True,
            timeoutSecs=30, retryDelaySecs=0.25)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'
        ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
        h2o.check_sandbox_for_errors()

    # do one last good one
    rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50,
        timeoutSecs=600, retryDelaySecs=3)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)

def notest_RF_iris2(self):
    trees = 6
    timeoutSecs = 20
    csvPathname = h2o.find_file('smalldata/iris/iris2.csv')
    h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)

def test_rf3_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to list for pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the algorithm for creating the path and filename is hardwired in parity.pl..i.e
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in range(1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put', pollTimeoutSecs=60, timeoutSecs=60)
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, max_depth=45, timeoutSecs=480)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1

def notest_RF_poker100(self):
    trees = 6
    timeoutSecs = 20
    csvPathname = h2o.find_file('smalldata/poker/poker100')
    h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)

def test_B_c1_fvec(self):
    print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here"
    print "Want to be able to run python as jenkins"
    print "I guess for big 0xcust files, we don't need schema='put'"
    print "For files that we want to put (for testing put), we can get non-private files"
    parseResult = h2i.import_parse(bucket='0xcustomer-datasets', path='c1/iris2.csv',
        schema='local', timeoutSecs=60)
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=60)

def test_rf_params_rand2_ncaa(self):
    csvPathname = 'ncaa/Players.csv'
    for trial in range(4):
        # params is mutable. This is default.
        params = {'ntree': 13, 'features': 4}
        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15))
        # hack to NA the header (duplicate header names)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='put', header=0)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
        elapsed = time.time() - start
        print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs)

def tryThemAll(self, set, rows):
    for eolCase in range(len(self.eolDict)):
        eol = self.eolDict[eolCase]
        # change tokens must be first
        for tokenCase in range(len(self.tokenChangeDict)):
            newRows1 = self.changeTokens(rows, tokenCase)
            for sepCase in range(len(self.sepChangeDict)):
                (newSep, newRows2) = self.changeSep(newRows1, sepCase)
                csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                    str(set) + "_" + \
                    str(eolCase) + "_" + \
                    str(tokenCase) + "_" + \
                    str(sepCase) + \
                    '.data'
                self.writeRows(csvPathname, newRows2, eol)
                # give h2o the separator, to be nice. (integerized)
                parseResult = h2i.import_parse(path=csvPathname, schema='put',
                    separator=ord(newSep), noPrint=not h2o.verbose)
                h2o_cmd.runRF(parseResult=parseResult, trees=1, response_variable='C1',
                    timeoutSecs=10, retryDelaySecs=0.1, noPrint=True)
    h2o.verboseprint("Set", set)
    h2o.check_sandbox_for_errors()
    sys.stdout.write('.')
    sys.stdout.flush()

def test_rf_float_rand2_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_prostate.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 10000
    write_syn_dataset(csvPathname, totalRows, headerData)

    for trial in range(5):
        rowData = rand_rowData()
        num = random.randint(4096, 10096)
        append_syn_dataset(csvPathname, num)
        totalRows += num
        start = time.time()
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
        kwargs = {'ntrees': 5, 'max_depth': 5}
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
        print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
            'took', time.time() - start, 'seconds'
        ### h2o_cmd.runInspect(key=hex_key)
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        h2o.check_sandbox_for_errors()

def test_poker_xlsx(self):
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='xls/poker-hand-testing.xlsx',
        schema='put', timeoutSecs=120, pollTimeoutSecs=60)
    h2o_cmd.runRF(None, parseResult=parseResult, timeoutSecs=120)

def notest_C_RF_poker100(self):
    # RFview consumes cycles. Only retry once a second, to avoid slowing things down
    csvPathname = h2o.find_file('smalldata/poker/poker100')
    h2o_cmd.runRF(trees=6, model_key="poker100", timeoutSecs=10, retryDelaySecs=1,
        csvPathname=csvPathname)

def test_cs_training(self):
    h2o_cmd.runRF(trees=100, depth=100,
        csvPathname=h2o.find_file('smalldata/kaggle/creditsample-training.csv.gz'),
        timeoutSecs=300, response_variable=1)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")

def test_stedo_testing_data(self):
    csvPathname = h2o.find_file('smalldata/stego/stego_training.data')
    # Prediction class is the second column => class=1
    h2o_cmd.runRF(trees=50, timeoutSecs=30, csvPathname=csvPathname,
        response_variable=1, out_of_bag_error_estimate=1)

def test_badchars(self):
    print "badchars.csv has some 0x0 (<NUL>) characters."
    print "They were created by a dd that filled out to buffer boundary with <NUL>"
    print "They are visible using vim/vi"
    csvPathname = 'badchars.csv'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=50, timeoutSecs=10)

def test_tree_view(self):
    parseResult = h2i.import_parse(bucket="smalldata", path="poker/poker1000",
        hex_key="poker1000.hex", schema="put")
    h2o_cmd.runRF(parseResult=parseResult, trees=50, model_key="model0", timeoutSecs=10)
    for n in range(1):
        a = h2o_cmd.runRFTreeView(n=n, data_key="poker1000.hex", model_key="model0", timeoutSecs=10)
        print(h2o.dump_json(a))

def test_RFhhp(self):
    csvPathname = 'hhp.cut3.214.data.gz'
    print "RF start on ", csvPathname, "this will probably take 1 minute.."
    start = time.time()
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=10, timeoutSecs=400, retryDelaySecs=15)
    print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'

def test_rf_200x4_fvec(self):
    csvPathname = 'hhp.cut3.214.data.gz'
    print "RF start on ", csvPathname
    start = time.time()
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
    h2o_cmd.runRF(parseResult=parseResult, ntrees=3, timeoutSecs=1200, retryDelaySecs=15)
    print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'

def test_rf_params_rand1_fvec(self):
    csvPathname = 'poker/poker1000'
    for trial in range(10):
        # params is mutable. This is default.
        kwargs = params.copy()
        timeoutSecs = 180
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
            timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"

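# `params` copied in each trial above is assumed to be a module-level dict of default
# RF arguments (the test only copies it, so the dict itself is defined elsewhere in the
# file). A minimal sketch, assuming fvec-era kwarg names; the defaults are illustrative:
params = {
    'ntrees': 13,
    'max_depth': 20,
    'sample_rate': 0.67,
}
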