def test_rf_1ktrees_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [500]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above! (the gen uses 500 rows, so use 500 here)
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(500) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)
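# Every test in this collection rebuilds the same parity.pl command string by
# hand. A small helper like this hypothetical sketch could factor that out; the
# name gen_parity_dataset and its defaults are assumptions, not part of the h2o
# harness -- only find_file() and spawn_cmd_and_wait() are used, exactly as above.
def gen_parity_dataset(rows, syn_dir, timeout=4):
    # parity.pl hardwires its output name as parity_128_4_<rows>_quad.data
    shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + \
        " 128 4 " + str(rows) + " quad " + syn_dir
    h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout)
    return "parity_128_4_" + str(rows) + "_quad.data"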
def test_C_RF_poker100(self):
    parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker100', schema='put')
    h2o_cmd.runRF(parseResult=parseResult, trees=6, timeoutSecs=10)

    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 60
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in xrange(1, 3, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        key = h2o.nodes[0].put_file(csvPathname)
        parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=10000, depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=30)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # FIX! TBD do we always have to kick off the run from node 0?
        # what if we use another node?
        # FIX! do we need or want a random delay here?
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
        sys.stdout.write('.')
        sys.stdout.flush()
def test_rf3_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in range(1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put', pollTimeoutSecs=60, timeoutSecs=60)
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, max_depth=45, timeoutSecs=480)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in xrange(1, 10, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseKey = h2o_cmd.parseFile(None, csvPathname)
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=237, depth=45, timeoutSecs=120)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, ntrees=trees, timeoutSecs=timeoutSecs)
        trees += 10
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in [10000]:
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    trial = 1
    for x in xrange(1, 10, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        # just use one file for now
        csvFilename = "parity_128_4_" + str(10000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o.verboseprint("Trial", trial)
        h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)
        # don't change tree count yet
        ## trees += 10
        ### timeoutSecs += 2
        trial += 1
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # FIX! TBD do we always have to kick off the run from node 0?
        # what if we use another node?
        # FIX! do we need or want a random delay here?
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = (
            "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        )
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 3):
        sys.stdout.write(".")
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(
            path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30
        )
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(50, 200, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # bump this up too if you do?
    # always match the gen above!
    ### for x in xrange(50, 200, 10):
    for x in xrange(50, 200, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + "100" + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        h2o_cmd.runRF(csvPathname=csvPathname, trees=100, timeoutSecs=5, retryDelaySecs=0.1)
def test_1ktrees_job_cancel_many_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

    print "kick off jobs, then cancel them"
    for trial in range(1, 5):
        # random 0 to 1 second delay
        delay = random.uniform(0, 1)
        time.sleep(delay)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True,
            timeoutSecs=30, retryDelaySecs=0.25)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'
        ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
        h2o.check_sandbox_for_errors()

    # do one last good one
    rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_1ktrees_job_cancel_many(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

    print "Kick off the jobs, then cancel them all. There's a timeout on the wait after cancelling."
    for trial in range(1, 20):
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

    h2o.check_sandbox_for_errors()
    h2o_jobs.cancelAllJobs(timeoutSecs=10)
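# These cancel tests launch RF jobs with noPoll=True and then cancel them. The
# inverse, waiting for the job queue to drain, would look roughly like this
# sketch. It assumes the jobs_admin() JSON has a 'jobs' list whose entries carry
# an 'end_time' that is empty while a job is still running -- verify that field
# name against a real dump_json(a) before relying on it.
def wait_for_all_jobs(node, timeoutSecs=60, pollDelaySecs=1):
    running = []
    start = time.time()
    while time.time() - start < timeoutSecs:
        a = node.jobs_admin()
        running = [j for j in a['jobs'] if not j.get('end_time')]
        if not running:
            return
        time.sleep(pollDelaySecs)
    raise Exception("%d jobs still running after %d secs" % (len(running), timeoutSecs))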
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # a two row dataset gets this, so avoid it for now:
    # java.lang.ArrayIndexOutOfBoundsException: 1
    #     at hex.rf.Data.sample_fair(Data.java:149)
    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"
    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # the path and filename are hardwired inside parity.pl
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2
def test_GenParity1(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    parityPl = h2o.find_file('syn_scripts/parity.pl')

    # a two row dataset gets this, so avoid it for now:
    # java.lang.ArrayIndexOutOfBoundsException: 1
    #     at hex.rf.Data.sample_fair(Data.java:149)
    # always match the run below!
    print "\nAssuming two row dataset is illegal. avoiding"
    for x in xrange(10, 100, 10):
        shCmdString = "perl " + parityPl + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split())
        # the path and filename are hardwired inside parity.pl
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # FIX! we fail if min is 3
    for x in xrange(10, 100, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
        timeoutSecs += 2
def test_rf_1ktrees_job_cancel_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, rfView=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']
        print "model_key:", model_key

        # FIX! need to get more intelligent here
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # this is the wrong key to cancel with
        # "destination_key": "pytest_model",
        print "cancelling with a bad key"
        b = h2o.nodes[0].jobs_cancel(key=model_key)
        print "jobs_cancel():", h2o.dump_json(b)
def test_B_GenParity1(self):
    # Create a directory for the created dataset files. ok if it already exists
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
    print "\nmay be a minute.........."
    for x in xrange(161, 240, 20):
        # more rows!
        y = 10000 * x
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        # UPDATE: maybe EC2 takes a long time to spawn a process?
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=90)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(y) + "_quad.data"
        sys.stdout.write('.')
        sys.stdout.flush()
    print "\nDatasets generated. Using."

    # always match the gen above!
    # Let's try it a few times!
    for trials in xrange(1, 7):
        # prime
        trees = 6
        for x in xrange(161, 240, 20):
            y = 10000 * x
            print "\nTrial:", trials, ", y:", y
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # random guess about length of time, varying with more hosts/nodes?
            timeoutSecs = 20 + 5 * (len(h2o.nodes))
            # change the model name each iteration, so they all stay in h2o
            model_key = csvFilename + "_" + str(trials)
            h2o_cmd.runRF(trees=trees, model_key=model_key, timeoutSecs=timeoutSecs,
                retryDelaySecs=1, csvPathname=csvPathname)
            sys.stdout.write('.')
            sys.stdout.flush()
        # partial clean, so we can look at tree builds from this run if it hangs
        h2o.clean_sandbox_stdout_stderr()
def test_1ktrees_job_cancel_many_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

    print "kick off jobs, then cancel them"
    for trial in range(1, 50):
        # random 0 to 1 second delay
        delay = random.uniform(0, 1)
        time.sleep(delay)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True,
            timeoutSecs=30, retryDelaySecs=0.25)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'
        ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
        h2o.check_sandbox_for_errors()

    # do one last good one
    rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=False,
        timeoutSecs=600, retryDelaySecs=3)
    (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
def test_B_GenParity1(self):
    # Create a directory for the created dataset files. ok if it already exists
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
    print "\nmay be a minute.........."
    for x in xrange(161, 240, 20):
        # more rows!
        y = 10000 * x
        # Have to split the string out to a list for the pipe
        shCmdString = (
            "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad " + SYNDATASETS_DIR
        )
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        # UPDATE: maybe EC2 takes a long time to spawn a process?
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), timeout=90)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(y) + "_quad.data"
        sys.stdout.write(".")
        sys.stdout.flush()
    print "\nDatasets generated. Using."

    # always match the gen above!
    # Let's try it a few times!
    for trials in xrange(1, 7):
        # prime
        trees = 6
        for x in xrange(161, 240, 20):
            y = 10000 * x
            print "\nTrial:", trials, ", y:", y
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # random guess about length of time, varying with more hosts/nodes?
            timeoutSecs = 20 + 5 * (len(h2o.nodes))
            # change the model name each iteration, so they all stay in h2o
            model_key = csvFilename + "_" + str(trials)
            parseResult = h2i.import_parse(path=csvPathname, schema="put")
            h2o_cmd.runRF(
                parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
            )
            sys.stdout.write(".")
            sys.stdout.flush()
        # partial clean, so we can look at tree builds from this run if it hangs
        h2o.clean_sandbox_stdout_stderr()
def test_rf_1ktrees_job_cancel_3_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 20):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=2, rfView=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        # FIX! need to get more intelligent here
        time.sleep(1)
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # "destination_key": "pytest_model",
        # FIX! using 'key': 'pytest_model' with no time delay causes a failure
        time.sleep(1)
        jobsList = a['jobs']
        for j in jobsList:
            b = h2o.nodes[0].jobs_cancel(key=j['key'])
            print "jobs_cancel():", h2o.dump_json(b)
def test_rf_1ktrees_job_cancel_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        # without rfView, do we get the 'first' rf json?
        rfv = h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, rfView=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        # rf_model = rfv['drf_model']
        # data_key = rf_model['_dataKey']
        # model_key = rf_model['_key']
        data_key = rfv['source']['_key']
        model_key = rfv['destination_key']
        print "model_key:", model_key

        # FIX! need to get more intelligent here
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # this is the wrong key to cancel with
        # "destination_key": "pytest_model",
        print "cancelling with a bad key"
        b = h2o.nodes[0].jobs_cancel(key=model_key)
        print "jobs_cancel():", h2o.dump_json(b)
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in xrange(1, 20, 1):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        key = h2o.nodes[0].put_file(csvPathname)
        parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")
        h2o.verboseprint("Trial", trial)
        start = time.time()
        # rfview=False is used to inhibit waiting for RFView completion
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=trial, depth=2, rfview=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        # FIX! need to get more intelligent here
        time.sleep(1)
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # "destination_key": "pytest_model",
        # FIX! using 'key': 'pytest_model' with no time delay causes a failure
        time.sleep(1)
        jobsList = a['jobs']
        for j in jobsList:
            b = h2o.nodes[0].jobs_cancel(key=j['key'])
            print "jobs_cancel():", h2o.dump_json(b)
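# The "list jobs, cancel each by key" loop above repeats verbatim in several of
# these cancel tests. A hypothetical helper could factor it out; the name
# cancel_listed_jobs is an assumption -- only jobs_admin() and jobs_cancel()
# are harness calls, used exactly as in the tests above.
def cancel_listed_jobs(node):
    a = node.jobs_admin()
    print "jobs_admin():", h2o.dump_json(a)
    for j in a['jobs']:
        b = node.jobs_cancel(key=j['key'])
        print "jobs_cancel():", h2o.dump_json(b)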
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, rfView=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        model_key = rfResult['model_key']
        print "model_key:", model_key

        # FIX! need to get more intelligent here
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # this is the wrong key to cancel with
        # "destination_key": "pytest_model",
        print "cancelling with a bad key"
        b = h2o.nodes[0].jobs_cancel(key=model_key)
        print "jobs_cancel():", h2o.dump_json(b)
def test_D_GenParity1(self):
    # Create a directory for the created dataset files. ok if it already exists
    SYNDATASETS_DIR = h2o.make_syn_dir()

    # always match the run below!
    print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
    print "\nmay be a minute.........."
    for x in xrange(161, 240, 20):
        # more rows!
        y = 10000 * x
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        # large row counts. need more time
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=90)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(y) + "_quad.data"
        sys.stdout.write('.')
        sys.stdout.flush()
    print "\nDatasets generated. Using."

    # always match the gen above!
    # Let's try it a few times!
    for trials in xrange(1, 7):
        # prime trees = 4057
        trees = 6
        for x in xrange(161, 240, 20):
            y = 10000 * x
            print "\nTrial:", trials, ", y:", y
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # FIX! TBD do we always have to kick off the run from node 0?
            # random guess about length of time, varying with more hosts/nodes?
            timeoutSecs = 30 + trees * (len(h2o.nodes))
            # change the model name each iteration, so they all stay in h2o
            model_key = csvFilename + "_" + str(trials)
            h2o_cmd.runRF(trees=trees, model_key=model_key, timeoutSecs=timeoutSecs,
                retryDelaySecs=1, csvPathname=csvPathname)
            sys.stdout.write('.')
            sys.stdout.flush()
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        h2o_cmd.runRF(trees=trees, timeoutSecs=timeoutSecs, csvPathname=csvPathname)
        trees += 10
def test_rf_10ktrees_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 3):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=10000, max_depth=2, timeoutSecs=900, retryDelaySecs=3)
        print "RF #", trial, "end on ", csvFilename, 'took', time.time() - start, 'seconds'

    print "Waiting 60 secs for TIME_WAIT sockets to go away"
    time.sleep(60)
def test_rf_1ktrees_job_cancel_3_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 20):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=2, rfView=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        # FIX! need to get more intelligent here
        time.sleep(1)
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # "destination_key": "pytest_model",
        # FIX! using 'key': 'pytest_model' with no time delay causes a failure
        time.sleep(1)
        jobsList = a['jobs']
        for j in jobsList:
            b = h2o.nodes[0].jobs_cancel(key=j['key'])
            print "jobs_cancel():", h2o.dump_json(b)
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(50, 200, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # bump this up too if you do?
    # always match the gen above!
    ### for x in xrange(50, 200, 10):
    for x in xrange(50, 200, 10):
        sys.stdout.write(".")
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + "100" + "_quad.data"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        h2o_cmd.runRF(csvPathname=csvPathname, trees=100, timeoutSecs=5, retryDelaySecs=0.1)
def test_D_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    for x in xrange(11, 100, 10):
        # Have to split the string out to a list for the pipe
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        # FIX! as long as we're doing a couple, you'd think we wouldn't have to
        # wait for the last one to be gen'ed here before we start the first below.
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), timeout=3)
        # the path and filename are hardwired inside parity.pl, i.e.:
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    trees = 6
    timeoutSecs = 20
    # always match the gen above!
    # reduced range, to lessen intermittent failures for now
    for x in xrange(11, 60, 10):
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, trees=trees, timeoutSecs=timeoutSecs)
        trees += 10
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in range(1, 5):
        sys.stdout.write('.')
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)
        h2o.verboseprint("Trial", trial)
        start = time.time()
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, rfView=False,
            timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, 'took', time.time() - start, 'seconds'

        model_key = rfResult['model_key']
        print "model_key:", model_key

        # FIX! need to get more intelligent here
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # this is the wrong key to cancel with
        # "destination_key": "pytest_model",
        print "cancelling with a bad key"
        b = h2o.nodes[0].jobs_cancel(key=model_key)
        print "jobs_cancel():", h2o.dump_json(b)
def test_GenParity1(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [1000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
        h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    # always match the gen above!
    for trial in xrange(1, 5, 1):
        sys.stdout.write(".")
        sys.stdout.flush()
        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename
        # broke out the put separately so we can iterate a test just on the RF
        key = h2o.nodes[0].put_file(csvPathname)
        parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")
        h2o.verboseprint("Trial", trial)
        start = time.time()
        # rfview=False is used to inhibit waiting for RFView completion
        h2o_cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, rfview=False, timeoutSecs=600, retryDelaySecs=3)
        print "RF #", trial, "started on ", csvFilename, "took", time.time() - start, "seconds"

        # FIX! need to get more intelligent here
        a = h2o.nodes[0].jobs_admin()
        print "jobs_admin():", h2o.dump_json(a)
        # this is the wrong key to cancel with
        # "destination_key": "pytest_model",
        print "cancelling with a bad key"
        b = h2o.nodes[0].jobs_cancel(key="pytest_model")
        print "jobs_cancel():", h2o.dump_json(b)
def test_rf_parity_cmp(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [50000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    def doBoth():
        h2o.verboseprint("Trial", trial)
        start = time.time()
        # make sure ntrees and max_depth are the same for both
        rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
            timeoutSecs=600, retryDelaySecs=3)
        elapsed1 = time.time() - start
        (totalError1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

        # restart the clock, so elapsed2 measures just the SpeeDRF run
        start = time.time()
        rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
            timeoutSecs=600, retryDelaySecs=3)
        elapsed2 = time.time() - start
        (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

        print "Checking that results are similar (within 20%)"
        print "DRF2 then SpeeDRF"
        print "per-class variance is large; basically we can't check very well for this dataset"
        for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
            print "classErrorPctList[%s]: %s %s" % (i, j, k)
            # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
            #     delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

        print "totalError: %s %s" % (totalError1, totalError2)
        self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2,
            msg="Comparing RF total error for DRF2 and SpeeDRF")
        print "elapsed: %s %s" % (elapsed1, elapsed2)
        self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2,
            msg="Comparing RF times for DRF2 and SpeeDRF")

    # always match the gen above!
    for trial in range(1):
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)

        inspect = h2o_cmd.runInspect(key=hex_key)
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        response = "C" + str(numCols)
        ntrees = 30

        doBoth()
        print "*****************************"
        print "end # %s RF compare" % trial,
        print "*****************************"

        print "Now change all cols to enums"
        for e in range(numCols):
            enumResult = h2o.nodes[0].to_enum(src_key=hex_key, column_index=(e + 1))

        doBoth()
        print "*********************************"
        print "end # %s RF compare, with enums #" % trial,
        print "*********************************"
def test_rf_parity_cmp(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # always match the run below!
    # just using one file for now
    for x in [50000]:
        shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
        h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
        csvFilename = "parity_128_4_" + str(x) + "_quad.data"

    def doBoth():
        h2o.verboseprint("Trial", trial)
        start = time.time()
        # make sure ntrees and max_depth are the same for both
        rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
            timeoutSecs=600, retryDelaySecs=3)
        elapsed1 = time.time() - start
        (totalError1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

        # restart the clock, so elapsed2 measures just the SpeeDRF run
        start = time.time()
        rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
            timeoutSecs=600, retryDelaySecs=3)
        elapsed2 = time.time() - start
        (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

        print "Checking that results are similar (within 20%)"
        print "DRF2 then SpeeDRF"
        print "per-class variance is large; basically we can't check very well for this dataset"
        for i, (j, k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
            print "classErrorPctList[%s]: %s %s" % (i, j, k)
            # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
            #     delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

        print "totalError: %s %s" % (totalError1, totalError2)
        self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2,
            msg="Comparing RF total error for DRF2 and SpeeDRF")
        print "elapsed: %s %s" % (elapsed1, elapsed2)
        self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2,
            msg="Comparing RF times for DRF2 and SpeeDRF")

    # always match the gen above!
    for trial in range(1):
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)

        inspect = h2o_cmd.runInspect(key=hex_key)
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        response = "C" + str(numCols)
        ntrees = 30

        doBoth()
        print "*****************************"
        print "end # %s RF compare" % trial,
        print "*****************************"

        print "Now change all cols to enums"
        for e in range(numCols):
            enumResult = h2o.nodes[0].to_enum(src_key=hex_key, column_index=(e + 1))

        doBoth()
        print "*********************************"
        print "end # %s RF compare, with enums #" % trial,
        print "*********************************"
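# The two assertAlmostEqual calls in doBoth() use unittest's delta form: the
# assertion fails only if abs(first - second) > delta. With delta=.2 * totalError2
# the DRF2 total error may differ from the SpeeDRF value by up to 20% of it
# (and elapsed time by up to 50%). A minimal self-contained illustration of the
# same check, outside unittest:
def almost_equal(a, b, delta):
    # mirrors assertAlmostEqual(a, b, delta=delta)
    return abs(a - b) <= delta

assert almost_equal(0.11, 0.10, delta=.2 * 0.10)      # |0.01| <= 0.02, passes
assert not almost_equal(0.13, 0.10, delta=.2 * 0.10)  # |0.03| >  0.02, fails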