def test_GLM_gamma_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 24 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    """Run kmeans on the hastie dataset at 1x, 2x and 4x replication.

    Replications are built by gunzip+cat into SYNDATASETS_DIR; shuffling
    verifies that row order doesn't matter.
    FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    in other tests. (catdata?)
    """
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    kmeans_doit(self, csvFilename, csvPathname, num_rows=1000000, timeoutSecs=60)

    # 1x: gunzip, then shuffle
    pathname1x = SYNDATASETS_DIR + '/' + "hastie_1x.data"
    h2o_util.file_gunzip(csvPathname, pathname1x)
    pathname1xShuf = SYNDATASETS_DIR + '/' + "hastie_1x.data_shuf"
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    # 2x: cat the shuffled 1x with itself, then shuffle the result
    pathname2x = SYNDATASETS_DIR + '/' + "hastie_2x.data"
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)
    pathname2xShuf = SYNDATASETS_DIR + '/' + "hastie_2x.data_shuf"
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    kmeans_doit(self, "hastie_2x.data_shuf", pathname2xShuf,
        num_rows=2000000, timeoutSecs=90)

    # 4x: too big to shuffle?
    pathname4x = SYNDATASETS_DIR + '/' + "hastie_4x.data"
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    kmeans_doit(self, "hastie_4x.data", pathname4x,
        num_rows=4000000, timeoutSecs=120)
def test_rf_covtype_train_full(self): csvFilename = 'train.csv' csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename) print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() rfView = h2o_cmd.runRF(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix'][ 'classification_error'] self.assertLess( classification_error, 0.02, "train.csv should have full classification error <0.02") print "Trial #", trial, "completed"
# Put/parse a5m.csv three times under different keys ('A', 'B', 'C') to
# exercise repeated upload+parse of the same file; opens a browser on the
# cloud for manual inspection. Every list entry carries trees=None, so the
# `if trees is not None:` RF branch never executes as configured (dead code).
# NOTE(review): this method was flattened onto one line; it is left
# byte-identical because the exact extent of the trailing `if` suite
# (whether the browse/sleep/stdout calls sit inside it) can't be confirmed
# from the flattened text — verify against the original file before editing.
def test_putfile_a5m(self): timeoutSecs = 500 csvFilenameList = [ # use different names for each parse # doesn't fail if gzipped? ("a5m.csv", 'A', None), ("a5m.csv", 'B', None), ("a5m.csv", 'C', None), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, key, trees) in csvFilenameList: csvPathname = h2o.find_dataset(csvFilename) # creates csvFilename and csvFilename.hex keys parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseKey=parseKey, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_rf_params_rand2(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ( (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) * (kwargs['parallel'] and 1 or 3)) start = time.time() h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs)
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 10 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'num_cross_validation_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_rf_strata_fail(self):
    # Single RF run combining a RANDOM sampling strategy with all-'undefined'
    # strata samples — reproduces a strata-parameter failure case.
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    timeoutSecs = 60
    rfArgs = {
        'response_variable': 54,
        'ntree': 50,
        'features': '',
        'depth': 2147483647,
        'stat_type': 'ENTROPY',
        'ignore': '',
        'class_weights': '1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0',
        'sampling_strategy': 'RANDOM',
        'strata_samples': 'undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined,undefined=undefined',
        'sample': '67',
        'out_of_bag_error_estimate': 1,
        'model_key': '',
        'bin_limit': 1024,
        'seed': 784834182943470027,
        'parallel': 1,
        'exclusive_split_limit': '',
        'iterative_cm': 1,
        'use_non_local_data': 0,
    }
    h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **rfArgs)
def test_A_1mx10_hastie_10_2(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} (modelKey, validations1) = glm_doit(self, csvFilename, csvPathname, timeoutSecs=60, pollTimeoutSecs=60, **kwargs) print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x" filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(csvPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_score(self,filename2x, pathname2x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one" for i in range(3): print "\nTrial #", i, "of", filename4x glm_score(self,filename4x, pathname4x, modelKey, thresholds="0.5", timeoutSecs=60, pollTimeoutSecs=60)
def test_GLM_poisson_1(self): csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1 == 0): print "WARNING: just doing the first 33 features, for comparison to ??? numbers" # pythonic! x = ",".join(map(str, range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'x': x, 'y': y, 'family': 'poisson', 'link': 'log', 'n_folds': 0, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } timeoutSecs = 120 # L2 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': 'binomial', 'max_iter': 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds'] * 10 + params['max_iter'] * 10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_glm_covtype_single_cols(self): timeoutSecs = 10 csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds'
def test_glm_covtype_single_cols(self): timeoutSecs = 10 csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # GLM on the hastie dataset at 1x, 2x and 4x replication, with shuffling
    # to show that row order doesn't matter.
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=30)

    # 1x: gunzip then shuffle
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)
    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    # 2x: cat shuffled 1x with itself, shuffle the result, rerun GLM
    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)
    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, pathname2xShuf, timeoutSecs=45)

    # 4x: too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    glm_doit(self, filename4x, pathname4x, timeoutSecs=120)
def test_A_1mx10_hastie_10_2(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename) glm_doit(self,csvFilename, csvPathname, timeoutSecs=30) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(csvPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_doit(self,filename2x, pathname2x, timeoutSecs=45) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self,filename4x, pathname4x, timeoutSecs=60)
def test_exec_filter_slice2(self): timeoutSecs = 10 csvFilename = "covtype.data" csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') key2 = "c" parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c', 10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) for trial in range(10): print "Doing the execs in order, to feed filters into slices" nodeX = 0 for exprTemplate in exprList: execExpr = h2e.fill_in_expr_template(exprTemplate, colX=0, n=0, row=1, key2=key2, m=2) time.sleep(2) h2o.check_sandbox_for_errors() execResultInspect, min_value = h2e.exec_expr( h2o.nodes[nodeX], execExpr, resultKey="Result.hex", timeoutSecs=4) print "min_value:", min_value, "execExpr:", execExpr h2o.verboseprint("min: ", min_value, "trial:", trial)
def test_A_1mx10_hastie_10_2(self): # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR # FIX! eventually we'll compare the 1x, 2x and 4x results like we do # in other tests. (catdata?) csvFilename = "1mx10_hastie_10_2.data.gz" csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename) glm_doit(self,csvFilename, csvPathname, timeoutSecs=75) filename1x = "hastie_1x.data" pathname1x = SYNDATASETS_DIR + '/' + filename1x h2o_util.file_gunzip(csvPathname, pathname1x) filename2x = "hastie_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) glm_doit(self,filename2x, pathname2x, timeoutSecs=75) filename4x = "hastie_4x.data" pathname4x = SYNDATASETS_DIR + '/' + filename4x h2o_util.file_cat(pathname2x,pathname2x,pathname4x) print "Iterating 3 times on this last one for perf compare" for i in range(3): print "\nTrial #", i, "of", filename4x glm_doit(self,filename4x, pathname4x, timeoutSecs=150)
def test_rf_params_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') kwargs = { 'response_variable': 54, 'features': 7, 'sampling_strategy': 'STRATIFIED_LOCAL', 'out_of_bag_error_estimate': 1, 'strata_samples': '1=10,2=99,3=99,4=99,5=99,6=99,7=99', 'bin_limit': None, 'seed': '11111', 'model_key': '012345', 'ntree': 13, 'parallel': 1 } for trial in range(2): # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) start = time.time() rfv = h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time()-start cm = rfv['confusion_matrix'] classification_error = cm['classification_error'] rows_skipped = cm['rows_skipped'] # just want to catch the nan case when all rows are skipped self.assertLess(rows_skipped, 581012) self.assertLess(classification_error, 100) # error if nan print "Trial #", trial, "completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_covtype_train_oobe(self): if (1 == 0): csvFilename = 'train.csv' csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename) print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, timeoutSecs=180) # FIX! maybe try specifying column header with column name ### kwargs['response_variable'] = A55 else: csvFilename = 'covtype.data' print "\nUsing header=0 on the normal covtype.data" csvPathname = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=0, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix'][ 'classification_error'] self.assertGreater( classification_error, 0.01, "train.csv should have out of bag error estimate greater than 0.01" ) print "Trial #", trial, "completed"
def test_rand_inspect(self): ### h2b.browseTheCloud() csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename) print "\n" + csvPathname parseKey = h2o_cmd.parseFile(None, csvPathname, key=csvFilename, timeoutSecs=10) destination_key = parseKey['destination_key'] print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", destination_key def inspect_and_check(nodeX, destination_key, offset, view, inspect=None): inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view) # FIX! get min/max/mean/variance for a col too? constantNames = [ 'num_cols', 'num_rows', ] if inspect is not None: for i in constantNames: self.assertEqual(inspect[i], inspectNew[i]) return inspectNew # going to use this to compare against future. num_rows/num_cols should always # be the same, regardless of the view. just a coarse sanity check origInspect = inspect_and_check(0, destination_key, 0, 1) h2o.verboseprint(h2o.dump_json(origInspect)) num_rows = origInspect['num_rows'] num_cols = origInspect['num_cols'] lenNodes = len(h2o.nodes) for i in range(1000): # we want to use the boundary conditions, so have two level of random choices offset = good_choices(num_rows) view = good_choices(num_cols) # randomize the node used nodeX = random.randint(0, lenNodes - 1) print "nodeX:", nodeX, "offset:", offset, "view:", view inspect_and_check(nodeX, destination_key, offset, view, origInspect) # do it again, once in a while r = random.randint(0, 10) if (r == 0): inspect_and_check(nodeX, destination_key, offset, view, origInspect)
def test_poker_xlsx(self):
    # Parse an xlsx dataset, then run RF on the parsed key.
    # maybe can get stuck during polling for parse progress?
    # break it out for pollTimeoutSecs
    xlsxPath = h2o.find_dataset('poker/poker-hand-testing.xlsx')
    parseKey = h2o_cmd.parseFile(None, xlsxPath,
        timeoutSecs=120, pollTimeoutSecs=60)
    h2o_cmd.runRFOnly(None, parseKey=parseKey, timeoutSecs=120)
def test_rf_params_rand2_7066883810153380318(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(10): # params is mutable. This is default. params = {'ntree': 23, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs) print "Trial #", trial, "completed"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs) print "Trial #", trial, "completed"
def test_GLM_covtype(self): csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1==0): print "WARNING: just doing the first 33 features, for comparison to allstate numbers" # pythonic! x = ",".join(map(str,range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" # L2 args = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_eps': 1e-3} timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def test_loop_random_exec_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15) print "\nParse key is:", parseKey['destination_key'] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=5) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_rf_params_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) start = time.time() h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time()-start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_covtype_train_oobe(self): if (1==0): csvFilename = 'train.csv' csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename) print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, timeoutSecs=180) # FIX! maybe try specifying column header with column name ### kwargs['response_variable'] = A55 else: csvFilename = 'covtype.data' print "\nUsing header=0 on the normal covtype.data" csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=0, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2o_cmd.info_from_inspect(inspect, csvPathname) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix']['classification_error'] self.assertGreater(classification_error, 0.01, "train.csv should have out of bag error estimate greater than 0.01") print "Trial #", trial, "completed"
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    gzName = "1mx10_hastie_10_2.data.gz"
    gzPath = h2o.find_dataset('logreg' + '/' + gzName)
    glm_doit(self, gzName, gzPath, timeoutSecs=300)
    # 1x: plain-text copy
    path1x = SYNDATASETS_DIR + '/' + "hastie_1x.data"
    h2o_util.file_gunzip(gzPath, path1x)
    # 2x replication, same GLM
    path2x = SYNDATASETS_DIR + '/' + "hastie_2x.data"
    h2o_util.file_cat(path1x, path1x, path2x)
    glm_doit(self, "hastie_2x.data", path2x, timeoutSecs=300)
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=300)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, pathname2x, timeoutSecs=300)
def test_rand_inspect(self):
    """Randomized Inspect sanity check on parsed covtype.

    Parses covtype once, records the baseline inspect, then hammers
    Inspect with random offset/view values on random nodes, asserting
    that the view-independent fields (num_rows/num_cols) never change.
    """
    ### h2b.browseTheCloud()
    csvFilename = 'covtype.data'
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/'+ csvFilename)
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(None, csvPathname, key=csvFilename, timeoutSecs=10)
    destination_key = parseKey['destination_key']
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", destination_key

    def inspect_and_check(nodeX,destination_key,offset,view,inspect=None):
        # Inspect on the chosen node; if a prior inspect is given, assert
        # the fields that should not depend on offset/view match it.
        inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view)
        # FIX! get min/max/mean/variance for a col too?
        constantNames = [
            'num_cols',
            'num_rows',
        ]
        if inspect is not None:
            for i in constantNames:
                self.assertEqual(inspect[i], inspectNew[i])
        return inspectNew

    # going to use this to compare against future. num_rows/num_cols should always
    # be the same, regardless of the view. just a coarse sanity check
    origInspect = inspect_and_check(0,destination_key,0,1)
    h2o.verboseprint(h2o.dump_json(origInspect))

    num_rows = origInspect['num_rows']
    num_cols = origInspect['num_cols']

    lenNodes = len(h2o.nodes)
    for i in range (1000):
        # we want to use the boundary conditions, so have two level of random choices
        offset = good_choices(num_rows)
        view = good_choices(num_cols)
        # randomize the node used
        nodeX = random.randint(0,lenNodes-1)
        print "nodeX:", nodeX, "offset:", offset, "view:", view
        inspect_and_check(nodeX,destination_key,offset,view,origInspect)

        # do it again, once in a while
        r = random.randint(0,10)
        if (r==0):
            inspect_and_check(nodeX,destination_key,offset,view,origInspect)
def test_sum(self): print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' + filename1x) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, key2, timeoutSecs, resultMult) in csvAll: parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=2000) print "Parse result['Key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname h2o_exec.exec_zero_list(zeroList) colResultList = h2o_exec.exec_expr_list_across_cols( lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') # for determinism, I guess we should spit out the seed? ##### SEED = random.randint(0, sys.maxint) # if you have to force to redo a test SEED = 4201285065147091758 random.seed(SEED) print "\nUsing random seed:", SEED for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs) print "Trial #", trial, "completed"
def test_B_putfile_files(self): timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "covtype20x.data", # "billion_rows.csv.gz", csvFilenameList = [ ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, datasetPath, trees) in csvFilenameList: csvPathname = h2o.find_dataset(datasetPath) # creates csvFilename and csvFilename.hex keys node = h2o.nodes[0] key = node.put_file(csvPathname, key=csvFilename, timeoutSecs=timeoutSecs) # not using parseFile...used to be a bug if we inspect the file we just put # so we test that inspect1 = h2o_cmd.runInspect(key=csvFilename) parseKey = node.parse(key, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect2 = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs) sys.stdout.write('.') sys.stdout.flush()
def test_rf_params_rand2_7066883810153380318(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') # for determinism, I guess we should spit out the seed? # random.seed(SEED) # SEED = random.randint(0, sys.maxint) # if you have to force to redo a test SEED = 7066883810153380318 random.seed(SEED) print "\nUsing random seed:", SEED for trial in range(10): # params is mutable. This is default. params = {'ntree': 23, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs) print "Trial #", trial, "completed"
def test_rf_params_rand2_7066883810153380318(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') # for determinism, I guess we should spit out the seed? # random.seed(SEED) # SEED = random.randint(0, sys.maxint) # if you have to force to redo a test SEED = 7066883810153380318 random.seed(SEED) print "\nUsing random seed:", SEED for trial in range(20): # params is mutable. This is default. params = {'ntree': 23, 'parallel': 1} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 3) h2o_cmd.runRF(timeoutSecs=timeoutSecs, csvPathname=csvPathname, **kwargs) print "Trial #", trial, "completed"
def test_loop_random_exec_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 15) print "\nParse key is:", parseKey['destination_key'] h2b.browseTheCloud() h2e.exec_zero_list(zeroList) start = time.time() h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'c.hex', maxCol=54, maxRow=400000, maxTrials=200, timeoutSecs=15) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data", 'took', time.time( ) - start, 'seconds'
def test_loop_random_param_covtype(self): start = time.time() csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) print "upload/parse end on ", csvPathname, 'took', time.time() - start, 'seconds' kwargs = define_params() for trial in range(3): # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_rf_params_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ( (kwargs['ntree'] * 20) * max(1, kwargs['features'] / 15) * (kwargs['parallel'] and 1 or 3)) start = time.time() h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs)
def test_rf_params_rand2(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(20): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10) h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) print "Trial #", trial, "completed"
def test_rf_params_rand2(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') for trial in range(10): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'features': 7} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 30 + ((kwargs['ntree']*20) * max(1,kwargs['features']/15) * (kwargs['parallel'] and 1 or 3)) start = time.time() h2o_cmd.runRF(timeoutSecs=timeoutSecs, retryDelaySecs=1, csvPathname=csvPathname, **kwargs) elapsed = time.time()-start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_exec_covtype_cols(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 10) print "\nParse key is:", parseKey['destination_key'] ### h2b.browseTheCloud() start = time.time() # passes with suffix, fails without? # suffix = "" suffix = ".hex" print "Using .hex suffix everywhere until we get type checking in H2O.." + \ "Fails with first size=1 otherwise" for i in range(54): execExpr = "Result" + str(i) + suffix + " = c.hex[" + str(i) + "]" print "execExpr:", execExpr h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(i) + suffix, timeoutSecs=4) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
def test_sum(self): print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' + filename1x) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x,pathname1x,pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, key2, timeoutSecs, resultMult) in csvAll: parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=2000) print "Parse result['Key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname h2o_exec.exec_zero_list(zeroList) colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x)/resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'num_cross_validation_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 30 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['num_cross_validation_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_exec_filter_slice(self): timeoutSecs = 10 csvFilename = "covtype.data" csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') key2 = "c" parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c', 10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['desination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) for trial in range(10): print "Doing the execs in order, to feed filters into slices" nodeX = 0 for exprTemplate in exprList: execExpr = h2e.fill_in_expr_template(exprTemplate, colX=0, n=0, row=1, key2=key2, m=2) execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr, resultKey="Result.hex", timeoutSecs=4) print "min_value:", min_value, "execExpr:", execExpr h2o.verboseprint("min: ", min_value, "trial:", trial)
def test_putfile_a5m(self): timeoutSecs = 500 csvFilenameList = [ # use different names for each parse # doesn't fail if gzipped? ("a5m.csv", 'A', None), ("a5m.csv", 'B', None), ("a5m.csv", 'C', None), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, key, trees) in csvFilenameList: csvPathname = h2o.find_dataset(csvFilename) # creates csvFilename and csvFilename.hex keys parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees, depth=25, parseKey=parseKey, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_gamma_fail1(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) for trial in range(5): kwargs = { 'standardize': 1, 'family': 'gamma', 'link': 'familyDefault', 'y': 54, 'lambda': 0.0001, 'alpha': 0.5, 'max_iter': 25, 'n_folds': 1, } start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter # h2o_glm.simpleCheckGLM(self, glm, None, maxExpectedIterations=kwargs['max_iter']-2, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, None, **kwargs) print "Trial #", trial, "completed\n"
def test_B_putfile_files(self): timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "covtype20x.data", # "billion_rows.csv.gz", csvFilenameList = [ ("covtype.data", 'UCI/UCI-large/covtype/covtype.data', 1), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, datasetPath, trees) in csvFilenameList: csvPathname = h2o.find_dataset(datasetPath) # creates csvFilename and csvFilename.hex keys node = h2o.nodes[0] key = node.put_file(csvPathname, key=csvFilename, timeoutSecs=timeoutSecs) # not using parseFile...used to be a bug if we inspect the file we just put # so we test that inspect1 = h2o_cmd.runInspect(key=csvFilename) parseKey = node.parse(key, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect2 = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseKey=parseKey, timeoutSecs=timeoutSecs) sys.stdout.write('.') sys.stdout.flush()
def test_exec_covtype_cols(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', 'c.hex', 10) print "\nParse key is:", parseKey['destination_key'] ### h2b.browseTheCloud() start = time.time() # passes with suffix, fails without? # suffix = "" suffix = ".hex" print "Using .hex suffix everywhere until we get type checking in H2O.." + \ "Fails with first size=1 otherwise" for i in range(54): execExpr = "Result" + str(i) + suffix + " = c.hex[" + str(i) + "]" print "execExpr:", execExpr h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(i) + suffix, timeoutSecs=4) h2o.check_sandbox_for_errors() print "exec end on ", "covtype.data", 'took', time.time( ) - start, 'seconds'
def test_rf_covtype_train_full(self): csvFilename = 'train.csv' csvPathname = h2o.find_dataset('bench/covtype/h2o/' + csvFilename) print "\nUsing header=1 even though I shouldn't have to. Otherwise I get NA in first row and RF bad\n" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", header=1, timeoutSecs=180) for trial in range(1): # params is mutable. This is default. kwargs = paramDict # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() rfView = h2o_cmd.runRF(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) classification_error = rfView['confusion_matrix']['classification_error'] self.assertLess(classification_error, 0.02, "train.csv should have full classification error <0.02") print "Trial #", trial, "completed"
def test_G_RF_covtype(self):
    """Quick RF smoke test (6 trees) on the covtype dataset."""
    covtypePath = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    h2o_cmd.runRF(trees=6, timeoutSecs=35, retryDelaySecs=0.5,
                  csvPathname=covtypePath)
def test_loop_random_exec_covtype(self):
    """~100 random exec expressions on parsed covtype, spread across nodes.

    Each trial fills an expression template with random col/row values,
    runs it on a random node, and (after trial 1) inspects the result keys
    on several nodes to probe for cross-node consistency/race issues.
    """
    lenNodes = len(h2o.nodes)
    csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
    key2 = 'c.hex'
    parseKey = h2o_cmd.parseFile(None, csvPathname, 'covtype.data', key2, 10)
    print "\nParse key is:", parseKey['destination_key']
    h2b.browseTheCloud()
    # for trial in range(53):
    trial = 0
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            colX = random.randint(1, 54)
            row = random.randint(1, 400000)

            # substitute the random col/row/n/key values into the template
            execExpr = exprTemplate
            execExpr = re.sub('<col1>', str(colX), execExpr)
            execExpr = re.sub('<col2>', str(colX + 1), execExpr)
            execExpr = re.sub('<n>', str(n), execExpr)
            execExpr = re.sub('<row>', str(row), execExpr)
            execExpr = re.sub('<keyX>', str(key2), execExpr)

            # pick a random node to execute it on
            randNode = random.randint(0, lenNodes - 1)
            print "\nexecExpr:", execExpr, "on node", randNode

            start = time.time()
            resultExec = h2o_cmd.runExecOnly(node=h2o.nodes[randNode],
                expression=execExpr, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))

            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                # inspect the same key on three different nodes to check
                # they all see the exec result
                inspectMe = random.choice(inspectList)
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?

            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2b.browseJsonHistoryAsUrlLastMatch("Exec")
            # url = "http://192.168.0.37:54321/Exec?Expr=Result3+%3D+c.hex%5B26%5D+%2B+Result1&Key=Result"
            # webbrowser.open_new_tab(url)

            # FIX! I suppose we have the problem of stdout/stderr not having flushed?
            # should hook in some way of flushing the remote node stdout/stderr
            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data", 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"