def test_rf_params_rand2(self):
    """Run RF on the space shuttle damage CSV with randomized parameters.

    Each of the 10 trials starts from the same defaults, hands them to
    h2o_rf.pickRandRfParams, parses the CSV from the 'smalldata' bucket,
    builds the forest and checks the error figures returned by
    h2o_rf.simpleCheckRFView against fixed bounds.
    """
    csvPathname = 'space_shuttle_damage.csv'
    timeoutSecs = 180
    for trial in range(10):
        # Fresh defaults per trial. The original literal listed 'ntree'
        # twice (50, then 25); only the last entry ever took effect, so the
        # dead 50 entry is removed here.
        params = {
            'sample': 80,
            'stat_type': 'ENTROPY',
            'class_weights': 'yes=1000',
            'response_variable': 'damage',
            'ignore': 'flight',
            'ntree': 25,
            'out_of_bag_error_estimate': 1,
        }
        print "params:", params
        # presumably updates params in place (it is printed again below)
        h2o_rf.pickRandRfParams(paramDict, params)
        print "params:", params
        kwargs = params.copy()

        # the timed span covers both the parse and the RF build
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - start

        # just to get the list of per class errors
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs), "\n"

        # why does this vary between 22 and 23
        self.assertAlmostEqual(totalScores, 23, delta=1)
        # NOTE(review): the original notes say class 0 is 'no' and class 1
        # is 'yes'; which list index maps to which class is not visible here.
        self.assertLess(classErrorPctList[0], 95)
        self.assertLess(classErrorPctList[1], 29)
        self.assertLess(classification_error, 61)
def test_rf_params_rand2_fvec(self): h2o.beta_features = True csvPathname = "standard/covtype.data" hex_key = "covtype.data.hex" for trial in range(10): # params is mutable. This is default. params = {"ntrees": 13, "mtries": 7} colX = h2o_rf.pickRandRfParams(paramDict, params) if "cols" in params and params["cols"]: pass else: if "ignored_cols_by_name" in params and params["ignored_cols_by_name"]: params["mtries"] = random.randint(1, 53) else: params["mtries"] = random.randint(1, 54) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs["ntrees"] * 80) * max(1, kwargs["mtries"] / 60)) start = time.time() parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=hex_key ) h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs )
def test_rf_params_rand2_ncaa(self): csvPathname = 'ncaa/Players.csv' for trial in range(4): # params is mutable. This is default. params = {'ntree': 13, 'features': 4} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + ( (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15)) # hack to NA the header (duplicate header names) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', header=0) start = time.time() h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self):
    """Run RF on the local space shuttle damage CSV with randomized parameters.

    Each of the 10 trials starts from the same defaults, hands them to
    h2o_rf.pickRandRfParams, runs RF straight from the CSV path and checks
    the error figures returned by h2o_rf.simpleCheckRFView.
    """
    csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv')
    for trial in range(10):
        # Fresh defaults per trial. The original literal listed 'ntree'
        # twice (50, then 25); only the last entry ever took effect, so the
        # dead 50 entry is removed here.
        params = {
            'sample': 80,
            'stat_type': 'ENTROPY',
            'class_weights': 'yes=1000',
            'parallel': 1,
            'response_variable': 'damage',
            'ignore': 'flight',
            'ntree': 25,
            'out_of_bag_error_estimate': 1,
        }
        print "params:", params
        # presumably updates params in place (it is printed again below)
        h2o_rf.pickRandRfParams(paramDict, params)
        print "params:", params
        kwargs = params.copy()

        # seems ec2 can be really slow; allow more time when not parallel
        timeoutSecs = 30 + 15 * (6 if kwargs['parallel'] else 10)

        start = time.time()
        rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=csvPathname, **kwargs)
        elapsed = time.time() - start

        # just to get the list of per class errors
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs), "\n"

        # why does this vary between 22 and 23
        self.assertAlmostEqual(totalScores, 23, delta=1)
        # NOTE(review): the original notes say class 0 is 'no' and class 1
        # is 'yes'; which list index maps to which class is not visible here.
        self.assertLess(classErrorPctList[0], 95)
        self.assertLess(classErrorPctList[1], 29)
        self.assertLess(classification_error, 61)
def test_rf_params_rand2(self):
    """Run 10 randomized RF trials on covtype, reporting the seed used.

    A seed is drawn, applied to the `random` module and printed so that a
    failing run can be repeated by hard-coding the printed value.
    """
    # To force a redo of a specific run, replace the draw with the seed
    # printed by that run.
    SEED = random.randint(0, sys.maxint)
    # The seed must actually be applied, otherwise the printed value cannot
    # reproduce anything.
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()

        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + (
            (kwargs['ntree'] * 20)
            * max(1, kwargs['features'] / 15)
            * (1 if kwargs['parallel'] else 3))

        start = time.time()
        h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=csvPathname, **kwargs)
        elapsed = time.time() - start
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand2_7066883810153380318(self):
    """Run 10 randomized RF trials (23 trees by default) on the covtype dataset."""
    dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 23, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=dataset, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_covtype(self):
    """Run 10 randomized RF trials (13 trees by default) on the covtype dataset."""
    dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=dataset, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_poker1000(self):
    """Run 20 randomized RF trials on the local poker1000 file."""
    poker_file = h2o.find_file('smalldata/poker/poker1000')
    for trial in range(20):
        # params is mutable. This is default.
        params = {'ntree': 19, 'parallel': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        serial_factor = 1 if kwargs['parallel'] else 5
        timeoutSecs = 30 + kwargs['ntree'] * 10 * serial_factor

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=poker_file, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_covtype(self):
    """Run 10 randomized RF trials on covtype, parsing from the 'datasets' bucket."""
    data_path = 'UCI/UCI-large/covtype/covtype.data'
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        parseResult = h2i.import_parse(bucket='datasets', path=data_path, schema='put')
        h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_poker1000(self):
    """Run 20 randomized RF trials on poker1000 parsed from 'smalldata'.

    Fixes relative to the original: the parse result is now bound to
    `parseResult` (it was discarded, so the following runRFOnly call raised
    NameError), and the RF parameters are no longer forwarded to the parse
    call, matching the other import_parse call sites in this file.
    """
    csvPathname = 'poker/poker1000'
    for trial in range(20):
        # params is mutable. This is default.
        params = {'ntree': 19, 'parallel': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()

        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + kwargs['ntree'] * 10 * (1 if kwargs['parallel'] else 5)

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
            schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand2(self):
    """Run 10 randomized RF trials on covtype and report time against the timeout."""
    dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        began = time.time()
        h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=dataset, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand1(self):
    """Run 10 randomized RF trials (63 trees by default) on poker1000."""
    data_path = "poker/poker1000"
    for trial in range(10):
        # params is mutable. This is default.
        params = {"ntree": 63, "use_non_local_data": 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)
        print kwargs

        # adjust timeoutSecs with the number of trees
        # slower if parallel=0
        timeoutSecs = 30 + kwargs["ntree"] * 6

        # the same timeout is applied to the parse and to the RF run
        parseResult = h2i.import_parse(bucket="smalldata", path=data_path,
            schema="put", timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand1_fvec(self):
    """Run 10 randomized RF trials on poker1000, parsed to 'poker1000.hex'."""
    data_path = 'poker/poker1000'
    # NOTE(review): unlike sibling tests, this dict is created once outside
    # the loop, so whatever pickRandRfParams puts in it carries over to later
    # trials; confirm that is intended.
    params = {'ntrees': 2}
    for trial in range(10):
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)
        print kwargs

        # adjust timeoutSecs with the number of trees
        # slower if parallel=0
        timeoutSecs = 30 + kwargs['ntrees'] * 6

        parseResult = h2i.import_parse(bucket='smalldata', path=data_path,
            hex_key='poker1000.hex', schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand1(self):
    """Run 10 randomized RF trials on poker1000 with a 'parallel' default."""
    data_path = 'poker/poker1000'
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 63, 'parallel': 1, 'use_non_local_data': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)
        print kwargs

        # adjust timeoutSecs with the number of trees
        # slower if parallel=0
        serial_factor = 1 if kwargs['parallel'] else 5
        timeoutSecs = 30 + kwargs['ntree'] * 6 * serial_factor

        parseResult = h2i.import_parse(bucket='smalldata', path=data_path,
            schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand2(self):
    """Run 10 randomized RF trials on standard/covtype.data, timing parse plus RF."""
    data_path = 'standard/covtype.data'
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        # the timed span covers both the parse and the RF build
        began = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=data_path, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self):
    """Run 10 randomized RF trials on the ncaa Players.csv file by path."""
    players_csv = h2o.find_file('/home/0xdiag/datasets/ncaa/Players.csv')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 4}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        began = time.time()
        h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=players_csv, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand1_fvec(self):
    """Run 10 randomized RF trials on poker1000 with beta_features enabled."""
    h2o.beta_features = True
    data_path = 'poker/poker1000'
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntrees': 63}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)
        print kwargs

        # adjust timeoutSecs with the number of trees
        # slower if parallel=0
        timeoutSecs = 30 + kwargs['ntrees'] * 30

        parseResult = h2i.import_parse(bucket='smalldata', path=data_path,
            schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand1_fvec(self):
    """Run 10 randomized RF trials on poker1000 (beta_features on, fixed hex key)."""
    h2o.beta_features = True
    data_path = 'poker/poker1000'
    # NOTE(review): this dict is created once outside the loop, so whatever
    # pickRandRfParams puts in it carries over to later trials; confirm that
    # is intended.
    params = {'ntrees': 2}
    for trial in range(10):
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)
        print kwargs

        # adjust timeoutSecs with the number of trees
        # slower if parallel=0
        timeoutSecs = 30 + kwargs['ntrees'] * 6

        parseResult = h2i.import_parse(bucket='smalldata', path=data_path,
            hex_key='poker1000.hex', schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_covtype(self):
    """Replay 10 randomized RF trials on covtype under a hard-coded seed."""
    dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

    # Seed pinned to force a redo of one particular run; the random draw
    # (random.randint(0, sys.maxint)) is disabled here.
    SEED = 4201285065147091758
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=dataset, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_poker1000(self):
    """Run 20 randomized RF trials on poker1000 parsed from 'smalldata'."""
    data_path = 'poker/poker1000'
    for trial in range(20):
        # params is mutable. This is default.
        params = {'ntree': 19}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + kwargs['ntree'] * 10

        parseResult = h2i.import_parse(bucket='smalldata', path=data_path,
            schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand2_ncaa(self):
    """Run 4 randomized RF trials on ncaa/Players.csv parsed with header=0."""
    data_path = 'ncaa/Players.csv'
    for trial in range(4):
        # params is mutable. This is default.
        params = {'ntree': 13, 'features': 4}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        feature_factor = max(1, kwargs['features'] / 15)
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor)

        # hack to NA the header (duplicate header names)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=data_path, schema='put', header=0)

        # only the RF run is timed, not the parse
        began = time.time()
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self):
    """Run 4 randomized RF trials on ncaa/Players.csv with default parse settings."""
    data_path = "ncaa/Players.csv"
    for trial in range(4):
        # params is mutable. This is default.
        params = {"ntree": 13, "features": 4}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        feature_factor = max(1, kwargs["features"] / 15)
        timeoutSecs = 30 + ((kwargs["ntree"] * 20) * feature_factor)

        parseResult = h2i.import_parse(bucket="home-0xdiag-datasets",
            path=data_path, schema="put")

        # only the RF run is timed, not the parse
        began = time.time()
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self):
    """Run 20 randomized RF trials on covtype, reporting the seed used.

    A seed is drawn, applied to the `random` module and printed so that a
    failing run can be repeated by hard-coding the printed value.
    """
    # To force a redo of a specific run, replace the draw with the seed
    # printed by that run.
    SEED = random.randint(0, sys.maxint)
    # The seed must actually be applied, otherwise the printed value cannot
    # reproduce anything.
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(20):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()

        # seems ec2 can be really slow; allow more time when not parallel
        timeoutSecs = 30 + 15 * (5 if kwargs['parallel'] else 10)

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=csvPathname, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand2_7066883810153380318(self):
    """Replay 10 randomized RF trials on covtype under seed 7066883810153380318."""
    dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

    # Seed pinned to force a redo of one particular run; the random draw
    # (random.randint(0, sys.maxint)) is disabled here.
    SEED = 7066883810153380318
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 23, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        feature_factor = max(1, kwargs['features'] / 15)
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + ((kwargs['ntree'] * 20) * feature_factor * serial_factor)

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=dataset, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand2_7066883810153380318(self):
    """Replay 20 randomized RF trials on covtype under seed 7066883810153380318."""
    dataset = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')

    # Seed pinned to force a redo of one particular run; the random draw
    # (random.randint(0, sys.maxint)) is disabled here.
    SEED = 7066883810153380318
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    for trial in range(20):
        # params is mutable. This is default.
        params = {'ntree': 23, 'parallel': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)

        # adjust timeoutSecs with the number of trees
        serial_factor = 1 if kwargs['parallel'] else 3
        timeoutSecs = 30 + kwargs['ntree'] * 10 * serial_factor

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=dataset, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand1(self):
    """Run 10 randomized RF trials (63 trees by default) on poker1000."""
    data_path = 'poker/poker1000'
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 63, 'use_non_local_data': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = dict(params)
        print kwargs

        # adjust timeoutSecs with the number of trees
        # slower if parallel=0
        timeoutSecs = 30 + kwargs['ntree'] * 6

        # the same timeout is applied to the parse and to the RF run
        parseResult = h2i.import_parse(bucket='smalldata', path=data_path,
            schema='put', timeoutSecs=timeoutSecs)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "Trial #", trial, "completed"
def test_loop_random_param_poker1000(self):
    """Run 20 randomized RF trials on the local poker1000 file, reporting the seed.

    A seed is drawn, applied to the `random` module and printed so that a
    failing run can be repeated by hard-coding the printed value.
    """
    csvPathname = h2o.find_file('smalldata/poker/poker1000')

    # To force a redo of a specific run, replace the draw with the seed
    # printed by that run.
    SEED = random.randint(0, sys.maxint)
    # The seed must actually be applied, otherwise the printed value cannot
    # reproduce anything.
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    for trial in range(20):
        # params is mutable. This is default.
        params = {'ntree': 19, 'parallel': 1}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()

        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + kwargs['ntree'] * 10 * (1 if kwargs['parallel'] else 5)

        h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs)
        print "Trial #", trial, "completed"
def test_rf_params_rand2(self):
    """Run 10 randomized, timed RF trials on covtype, reporting the seed used.

    A seed is drawn, applied to the `random` module and printed so that a
    failing run can be repeated by hard-coding the printed value.
    """
    # To force a redo of a specific run, replace the draw with the seed
    # printed by that run.
    SEED = random.randint(0, sys.maxint)
    # The seed must actually be applied, otherwise the printed value cannot
    # reproduce anything.
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'features': 7}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()

        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + (
            (kwargs['ntree'] * 20)
            * max(1, kwargs['features'] / 15)
            * (1 if kwargs['parallel'] else 3))

        start = time.time()
        h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=csvPathname, **kwargs)
        elapsed = time.time() - start
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self):
    """Run RF on the space shuttle damage CSV with randomized parameters.

    Each of the 10 trials starts from the same defaults, hands them to
    h2o_rf.pickRandRfParams, parses the CSV from the 'smalldata' bucket,
    builds the forest and checks the error figures returned by
    h2o_rf.simpleCheckRFView against fixed bounds.
    """
    csvPathname = 'space_shuttle_damage.csv'
    timeoutSecs = 180
    for trial in range(10):
        # Fresh defaults per trial. The original literal listed 'ntree'
        # twice (50, then 25); only the last entry ever took effect, so the
        # dead 50 entry is removed here.
        params = {
            'sample': 80,
            'stat_type': 'ENTROPY',
            'class_weights': 'yes=1000',
            'response_variable': 'damage',
            'ignore': 'flight',
            'ntree': 25,
            'out_of_bag_error_estimate': 1,
        }
        print "params:", params
        # presumably updates params in place (it is printed again below)
        h2o_rf.pickRandRfParams(paramDict, params)
        print "params:", params
        kwargs = params.copy()

        # the timed span covers both the parse and the RF build
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - start

        # just to get the list of per class errors
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, noPrint=True)
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs), "\n"

        # why does this vary between 22 and 23
        self.assertAlmostEqual(totalScores, 23, delta=1)
        # NOTE(review): the original notes say class 0 is 'no' and class 1
        # is 'yes'; which list index maps to which class is not visible here.
        self.assertLess(classErrorPctList[0], 95)
        self.assertLess(classErrorPctList[1], 29)
        self.assertLess(classification_error, 61)
def test_rf_params_rand2(self):
    """Run RF on the local space shuttle damage CSV, reporting the seed used.

    A seed is drawn, applied to the `random` module and printed so a failing
    run can be repeated. Each of the 10 trials starts from the same
    defaults, hands them to h2o_rf.pickRandRfParams, runs RF from the CSV
    path and checks the error figures from h2o_rf.simpleCheckRFView.
    """
    # To force a redo of a specific run, replace the draw with the seed
    # printed by that run.
    SEED = random.randint(0, sys.maxint)
    # The seed must actually be applied, otherwise the printed value cannot
    # reproduce anything.
    random.seed(SEED)
    print "\nUsing random seed:", SEED

    csvPathname = h2o.find_file('smalldata/space_shuttle_damage.csv')
    for trial in range(10):
        # Fresh defaults per trial. The original literal listed 'ntree'
        # twice (50, then 25); only the last entry ever took effect, so the
        # dead 50 entry is removed here.
        params = {
            'sample': 80,
            'gini': 0,
            'class_weights': 'yes=1000',
            'parallel': 1,
            'response_variable': 'damage',
            'ignore': 'flight',
            'ntree': 25,
            'out_of_bag_error_estimate': 1,
        }
        print "params:", params
        # presumably updates params in place (it is printed again below)
        h2o_rf.pickRandRfParams(paramDict, params)
        print "params:", params
        kwargs = params.copy()

        # seems ec2 can be really slow; allow more time when not parallel
        timeoutSecs = 30 + 15 * (6 if kwargs['parallel'] else 10)

        start = time.time()
        rfView = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1,
            csvPathname=csvPathname, **kwargs)
        elapsed = time.time() - start

        # just to get the list of per class errors
        # NOTE(review): the keyword is spelled 'noprint' here but 'noPrint'
        # at other call sites; left unchanged, confirm against h2o_rf.
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(None, rfView, noprint=True)
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs), "\n"

        self.assertEqual(totalScores, 23)
        # NOTE(review): the original notes say class 0 is 'no' and class 1
        # is 'yes'; which list index maps to which class is not visible here.
        self.assertLess(classErrorPctList[0], 82)
        self.assertLess(classErrorPctList[1], 29)
        self.assertLess(classification_error, 61)
def test_rf_params_rand2_fvec(self):
    """Run 2 randomized RF trials on covtype with beta_features enabled.

    After h2o_rf.pickRandRfParams has been given the defaults, 'mtries' is
    re-drawn at random unless a non-empty 'cols' entry is present.
    """
    h2o.beta_features = True
    data_path = 'standard/covtype.data'
    dest_key = 'covtype.data.hex'
    for trial in range(2):
        # params is mutable. This is default.
        params = {
            'ntrees': 13,
            'mtries': 7,
            'balance_classes': 0,
            'importance': 0
        }
        h2o_rf.pickRandRfParams(paramDict, params)

        if not params.get('cols'):
            # upper bound is 53 when columns are ignored by name, else 54
            upper = 53 if params.get('ignored_cols_by_name') else 54
            params['mtries'] = random.randint(1, upper)

        kwargs = dict(params)
        # adjust timeoutSecs with the number of trees
        mtries_factor = max(1, kwargs['mtries'] / 60)
        timeoutSecs = 30 + ((kwargs['ntrees'] * 80) * mtries_factor)

        # the timed span covers both the parse and the RF build
        began = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=data_path, schema='put', hex_key=dest_key)
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self):
    """Run 10 randomized RF trials on covtype with beta_features enabled.

    After h2o_rf.pickRandRfParams has been given the defaults, 'mtries' is
    re-drawn at random unless a non-empty 'cols' entry is present.
    """
    h2o.beta_features = True
    data_path = 'standard/covtype.data'
    for trial in range(10):
        # params is mutable. This is default.
        params = {'ntrees': 13, 'mtries': 7}
        h2o_rf.pickRandRfParams(paramDict, params)

        if not params.get('cols'):
            # upper bound is 53 when columns are ignored by name, else 54
            upper = 53 if params.get('ignored_cols_by_name') else 54
            params['mtries'] = random.randint(1, upper)

        kwargs = dict(params)
        # adjust timeoutSecs with the number of trees
        mtries_factor = max(1, kwargs['mtries'] / 60)
        timeoutSecs = 30 + ((kwargs['ntrees'] * 80) * mtries_factor)

        # the timed span covers both the parse and the RF build
        began = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=data_path, schema='put')
        h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        elapsed = time.time() - began
        print "Trial #", trial, "completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
def test_rfview_score(self):
    """Train RF on covtype, then score and predict under several RFView settings.

    Each of the 5 trials trains with randomized parameters and then runs
    four scoring passes (RFView, error check, timed predict). Each pass
    layers extra RFView arguments on top of the previous one.
    """
    csvPathnameTrain = 'UCI/UCI-large/covtype/covtype.data'
    print "Train with:", csvPathnameTrain
    parseResultTrain = h2i.import_parse(bucket='datasets', path=csvPathnameTrain,
        schema='put', hex_key="covtype.hex", timeoutSecs=15)
    dataKeyTrain = parseResultTrain['destination_key']

    # train and test parse the same file into the same hex key
    csvPathnameTest = 'UCI/UCI-large/covtype/covtype.data'
    print "Test with:", csvPathnameTest
    parseResultTest = h2i.import_parse(bucket='datasets', path=csvPathnameTest,
        schema='put', hex_key="covtype.hex", timeoutSecs=15)
    dataKeyTest = parseResultTest['destination_key']

    # RFView arguments added, cumulatively, before each scoring pass
    scoring_passes = (
        {},
        {'iterative_cm': 0},
        {'iterative_cm': 1},
        {'iterative_cm': 1, 'class_weights': '1=1,2=2,3=3,4=4,5=5,6=6,7=7'},
    )

    for trial in range(5):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0}
        h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        serial_factor = 1 if kwargs['parallel'] else 5
        timeoutSecs = 30 + kwargs['ntree'] * 10 * serial_factor

        rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        ### print "rf response:", h2o.dump_json(rfv)

        # These three are passed to runRFView positionally below, so they
        # are removed from the keyword set.
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)

        # don't check error if stratified
        # NOTE(review): this is also False when no 'sampling_strategy' key
        # is present at all; confirm that skipping the check then is intended.
        check_err = ('sampling_strategy' in kwargs
            and kwargs['sampling_strategy'] != 'STRATIFIED_LOCAL')

        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        for extra in scoring_passes:
            kwargs.update(extra)
            rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            # FIX! should update this expected classification error
            (classification_error, classErrorPctList, totalScores) = \
                h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
            if check_err:
                self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
                    msg="Classification error %s differs too much" % classification_error)

            # new web page for predict? throw it in here for now
            began = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                data_key=dataKeyTest)
            elapsed = time.time() - began
            print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

        print "Trial #", trial, "completed"
def test_rf_covtype20x(self):
    """Train RF on covtype20x without polling, then time RFView and predict.

    The file is parsed twice (train key '.A.hex', test key '.B.hex') and the
    test key is copied to a third key ('.C.hex') via exec for prediction.
    RF is dispatched with noPoll=True and job completion is awaited
    separately, so dispatch, job, RFView and predict are timed on their own.

    Fix relative to the original: the predict timing message now names
    dataKeyTest2, the key prediction actually runs on (it named dataKeyTest).
    """
    importFolderPath = 'standard'
    csvFilenameTrain = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    hex_key = 'covtype20x.data.A.hex'
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets',
        path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print csvFilenameTrain, 'parse time:', parseResultTrain['response']['time']
    h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    csvFilenameTest = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    hex_key = 'covtype20x.data.B.hex'
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets',
        path=csvPathname, hex_key=hex_key, timeoutSecs=500)
    print csvFilenameTest, 'parse time:', parseResultTest['response']['time']
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    dataKeyTest2 = 'covtype20x.data.C.hex'
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    h2o_cmd.runExec(expression=execExpr, timeoutSecs=15)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works
    # NOTE: the message below still mentions no_confusion_matrix=1, but that
    # parameter is disabled in the defaults (see the comment in params).
    print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
    # params is mutable. This is default.
    params = {
        'ntree': 6,
        'parallel': 1,
        'out_of_bag_error_estimate': 0,
        # Causes rest api illegal argument error.
        # 'no_confusion_matrix': 1,
        'model_key': 'RF_model'
    }
    h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()

    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 30 + kwargs['ntree'] * 60 * (1 if kwargs['parallel'] else 5)

    start = time.time()
    h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    start = time.time()
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300,
        pollTimeoutSecs=500, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['model_key']
    ntree = kwargs['ntree']
    start = time.time()
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        # FIX! should update this expected classification error
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        # report the key the prediction was actually run on
        print "predict", trial, "end on ", dataKeyTest2, 'took', time.time() - start, 'seconds.'

        print "Trial #", trial, "completed"
def test_rfview_score(self): csvPathnameTrain = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key',None) data_key = rfv['data_key'] kwargs.pop('data_key',None) ntree = rfv['ntree'] kwargs.pop('ntree',None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # new web page for predict? throw it in here for now start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' 
kwargs['iterative_cm'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rf_covtype20x_fvec(self): h2o.beta_features = True importFolderPath = 'standard' if DO_SMALL: csvFilenameTrain = 'covtype.data' hex_key = 'covtype1x.data.A.hex' else: csvFilenameTrain = 'covtype20x.data' hex_key = 'covtype20x.data.A.hex' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data if DO_SMALL: csvFilenameTest = 'covtype.data' hex_key = 'covtype1x.data.B.hex' dataKeyTest2 = 'covtype1x.data.C.hex' else: csvFilenameTest = 'covtype20x.data' hex_key = 'covtype20x.data.B.hex' dataKeyTest2 = 'covtype20x.data.C.hex' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest kwargs = {'str': execExpr, 'timeoutSecs': 15} resultExec = h2o_cmd.runExec(**kwargs) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
paramDict = drf2ParamDict params = {'ntrees': 20, 'destination_key': 'RF_model'} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() timeoutSecs = 30 + kwargs['ntrees'] * 60 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) print "rf job end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntree = kwargs['ntrees'] start = time.time() # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree) h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' for trial in range(1): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) self.assertAlmostEqual( classification_error, 50, delta=50, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' 
parseKey = parseResultTrain['destination_key'] rfModelKey = rfView['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual='C55', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_rf_change_data_key(self): importFolderPath = '/home/0xdiag/datasets/standard' importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvFilenameTrain = 'covtype.data' parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath, timeoutSecs=500) print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time'] inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key']) dataKeyTrain = parseKeyTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseKey = parseKey # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath, timeoutSecs=500) print csvFilenameTest, 'parse time:', parseKeyTest['response']['time'] print "Parse result['destination_key']:", parseKeyTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key']) dataKeyTest = parseKeyTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?" 
params = { 'ntree': 6, 'parallel': 1, 'out_of_bag_error_estimate': 0, 'no_confusion_matrix': 1, 'model_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5) start = time.time() rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['model_key'] ntree = kwargs['ntree'] start = time.time() h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(3): # scoring start = time.time() h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, out_of_bag_error_estimate=1, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' print "Trial #", trial, "completed"
def test_rf_covtype20x_fvec(self): h2o.beta_features = True importFolderPath = 'standard' if DO_SMALL: csvFilenameTrain = 'covtype.data' hex_key = 'covtype1x.data.A.hex' else: csvFilenameTrain = 'covtype20x.data' hex_key = 'covtype20x.data.A.hex' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data if DO_SMALL: csvFilenameTest = 'covtype.data' hex_key = 'covtype1x.data.B.hex' dataKeyTest2 = 'covtype1x.data.C.hex' else: csvFilenameTest = 'covtype20x.data' hex_key = 'covtype20x.data.B.hex' dataKeyTest2 = 'covtype20x.data.C.hex' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest if h2o.beta_features: kwargs = {'str': execExpr, 'timeoutSecs': 15} else: kwargs = {'expression': execExpr, 'timeoutSecs': 15} resultExec = h2o_cmd.runExec(**kwargs) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
if h2o.beta_features: paramDict = drf2ParamDict params = { 'ntrees': 20, 'destination_key': 'RF_model' } else: paramDict = drf1ParamDict params = { 'ntree': 20, 'out_of_bag_error_estimate': 1, 'model_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() if h2o.beta_features: timeoutSecs = 30 + kwargs['ntrees'] * 60 else: timeoutSecs = 30 + kwargs['ntree'] * 60 start = time.time() rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" if h2o.beta_features: model_key = kwargs['destination_key'] ntree = kwargs['ntrees'] else: model_key = kwargs['model_key'] ntree = kwargs['ntree'] start = time.time() # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree) h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(1): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) self.assertAlmostEqual(classification_error, 50, delta=50, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' 
parseKey = parseResultTrain['destination_key'] rfModelKey = rfView['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual='C54', predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) print "Trial #", trial, "completed"
def test_rfview_score(self): csvPathnameTrain = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = { 'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0 } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # new web page for predict? throw it in here for now start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' 
kwargs['iterative_cm'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rfview_score(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathnameTrain = h2o.find_file('smalldata/covtype/covtype.20k.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.20k.hex", timeoutSecs=10) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=10) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = { 'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0 } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? 
# dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& # no_confusion_matrix=1& # clear_confusion_matrix=1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) print "Trial #", trial, "completed"
def test_rf_change_data_key_fvec(self): importFolderPath = 'standard' csvFilenameTrain = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseResult = parseResult # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
params = {'ntrees': 2, 'destination_key': 'RF_model'} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 100 start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntrees = kwargs['ntrees'] start = time.time() h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntrees, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time( ) - start, 'seconds' for trial in range(3): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntrees, timeoutSecs, out_of_bag_error_estimate=1, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) # FIX! should update this expected classification error # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time( ) - start, 'seconds.' print "Trial #", trial, "completed"
def test_rf_covtype20x(self): importFolderPath = '/home/0xdiag/datasets/standard' importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvFilenameTrain = 'covtype20x.data' key2 = 'covtype20x.data.A.hex' parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath, key2=key2, timeoutSecs=500) print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time'] inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key']) dataKeyTrain = parseKeyTrain['destination_key'] print "Parse end", dataKeyTrain # have to re import since source key is gone # we could just copy the key, but sometimes we change the test/train data to covtype.data importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvFilenameTest = 'covtype20x.data' key2 = 'covtype20x.data.B.hex' parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath, key2=key2, timeoutSecs=500) print csvFilenameTest, 'parse time:', parseKeyTest['response']['time'] print "Parse result['destination_key']:", parseKeyTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key']) dataKeyTest = parseKeyTest['destination_key'] dataKeyTest2 = 'covtype20x.data.C.hex' print "Parse end", dataKeyTest # make a 3rd key so the predict is uncached too! execExpr = dataKeyTest2 + "=" + dataKeyTest resultExec = h2o_cmd.runExecOnly(expression=execExpr, timeoutSecs=15) # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?" 
params = { 'ntree': 6, 'parallel': 1, 'out_of_bag_error_estimate': 0, 'no_confusion_matrix': 1, 'model_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5) start = time.time() rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=500, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['model_key'] ntree = kwargs['ntree'] start = time.time() h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(3): # scoring start = time.time() h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' print "Trial #", trial, "completed"
def test_rf_change_data_key_fvec(self): h2o.beta_features = True importFolderPath = 'standard' csvFilenameTrain = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseResult = parseResult # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
params = { 'ntrees': 6, 'destination_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntrees'] * 60 start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntrees = kwargs['ntrees'] start = time.time() h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntrees, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(3): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntrees, timeoutSecs, out_of_bag_error_estimate=1, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) # FIX! should update this expected classification error # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' print "Trial #", trial, "completed"
def test_rfview_score(self): csvPathnameTrain = 'standard/covtype.data' print "Train with:", csvPathnameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathnameTrain, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTrain = parseResultTrain['destination_key'] csvPathnameTest = 'standard/covtype.data' print "Test with:", csvPathnameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathnameTest, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTest = parseResultTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, **kwargs) # new web page for predict? 
throw it in here for now (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if 'sampling_strategy' in kwargs and kwargs[ 'sampling_strategy'] != 'STRATIFIED_LOCAL': check_err = True else: check_err = False if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 0 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! 
should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rf_params_rand2(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathnameTrain = h2o.find_file('smalldata/covtype/covtype.20k.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.20k.hex", timeoutSecs=10) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=10) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key',None) data_key = rfv['data_key'] kwargs.pop('data_key',None) ntree = rfv['ntree'] kwargs.pop('ntree',None) # scoring # RFView.html? 
# dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& # no_confusion_matrix=1& # clear_confusion_matrix=1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) print "Trial #", trial, "completed"