def test_GLMGrid_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') y = "1" x = range(9) x.remove(0) # 0. member ID. not used. x.remove(1) # 1 is output x = ','.join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y # FIX! thresholds is used in GLMGrid. threshold is used in GLM # comma separated means use discrete values # colon separated is min/max/step # FIX! have to update other GLMGrid tests kwargs = { 'x': x, 'y': y, 'n_folds': 2, 'beta_eps': 1e-4, 'lambda': '1e-8:1e3:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs) colNames = ['D','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs) h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting parse of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename, timeoutSecs=10) y = "10" x = "" # NOTE: hastie has two values, -1 and 1. To make H2O work if two valued and not 0,1 have kwargs = { "x": x, "y": y, "case": "1", "destination_key": "gg", # better classifier it flipped? (better AUC?) "max_iter": 10, "case": -1, "case_mode": "=", "num_cross_validation_folds": 0, "lambda": "1e-8,1e-4,1e-3", "alpha": "0,0.25,0.8", "thresholds": "0.2:0.8:0.1", } start = time.time() print "\nStarting GLMGrid of", csvFilename glmGridResult = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLMGrid in", (time.time() - start), "secs (python)" h2o_glm.simpleCheckGLMGrid(self, glmGridResult, **kwargs)
def test_GLM2Grid_basic_benign(self): h2o.beta_features = True csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" print "y:", y kwargs = { 'ignored_cols': '0,1', 'response': y, 'n_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', } # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" # the gridded params make it grid..just call GLM2 gg = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLMGrid_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = "logreg/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") y = "1" x = range(9) x.remove(0) # 0. member ID. not used. x.remove(1) # 1 is output x = ",".join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y # FIX! thresholds is used in GLMGrid. threshold is used in GLM # comma separated means use discrete values # colon separated is min/max/step # FIX! have to update other GLMGrid tests kwargs = { "x": x, "y": y, "n_folds": 2, "beta_eps": 1e-4, "lambda": "1e-8:1e3:100", "alpha": "0,0.5,1", "thresholds": "0:1:0.01", } gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs) colNames = ["D", "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"] # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs) h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLM2grid_covtype_many(self): h2o.beta_features = True csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'response': y, 'family': 'gaussian', 'n_folds': 2, 'max_iter': max_iter, 'beta_epsilon': 1e-3, 'lambda': '0,0.5,0.8', 'alpha': '0,1e-8,1e-4', } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "glmResult:", h2o.dump_json(glmResult) # assuming it doesn't complete right away, this is the first response # it differs for the last response job_key = glmResult['job_key'] grid_key = glmResult['destination_key'] jobs.append( (job_key, grid_key) ) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(4): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start # 2/GLMGridView.html?grid_key=asd # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1 for job_key, grid_key in jobs: gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key) h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." 
print "totalGLMGridJobs:", totalGLMGridJobs
def test_GLMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" x = range(14) x.remove(0) # 0. skipping causes coefficient of 0 when used alone x.remove(3) # 3 is output x = ','.join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y kwargs = { 'x': x, 'y': y, 'n_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting parse of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=10) y = "10" # NOTE: hastie has two values, -1 and 1. To make H2O work if two valued and not 0,1 have kwargs = { 'response': y, 'max_iter': 10, 'n_folds': 2, 'lambda': '1e-8,1e-4,1e-3', 'alpha': '0,0.25,0.8', } start = time.time() print "\nStarting GLMGrid of", csvFilename glmGridResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLMGrid in", (time.time() - start), "secs (python)" # still get zero coeffs..best model is AUC = 0.5 with intercept only. h2o_glm.simpleCheckGLMGrid(self, glmGridResult, allowZeroCoeff=True, **kwargs)
def test_GLM2Grid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" print "y:", y kwargs = { 'ignored_cols': '0,1', 'response': y, 'n_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', } # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" # the gridded params make it grid..just call GLM2 gg = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN', 'NLV', 'LIV', 'WT', 'AGLP', 'MST' ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting parse of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # NOTE: hastie has two values, -1 and 1. To make H2O work if two valued and not 0,1 have kwargs = { 'x': x, 'y': y, 'case': '1', 'destination_key': 'gg', # better classifier it flipped? (better AUC?) 'max_iter': 10, 'case': -1, 'case_mode': '=', 'num_cross_validation_folds': 0, 'lambda': '1e-8,1e-4,1e-3', 'alpha': '0,0.25,0.8', # hardwire threshold to 0.5 because the dataset is so senstive right around threshold # otherwise, GLMGrid will pick a model with zero coefficients, if it has the best AUC # to avoid my checker complaining about all zero coefficients, force the threshold to 0.5 'thresholds': '0.5', # 'thresholds': '0.2:0.8:0.1' } start = time.time() print "\nStarting GLMGrid of", csvFilename glmGridResult = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLMGrid in", (time.time() - start), "secs (python)" # still get zero coeffs..best model is AUC = 0.5 with intercept only. h2o_glm.simpleCheckGLMGrid(self,glmGridResult, allowZeroCoeff=True,**kwargs)
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting parse of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # NOTE: hastie has two values, -1 and 1. To make H2O work if two valued and not 0,1 have kwargs = { 'x': x, 'y': y, 'case': '1', # better classifier it flipped? (better AUC?) 'max_iter': 10, 'case': -1, 'case_mode': '=', 'n_folds': 2, 'lambda': '1e-8,1e-4,1e-3', 'alpha': '0,0.25,0.8', # hardwire threshold to 0.5 because the dataset is so senstive right around threshold # otherwise, GLMGrid will pick a model with zero coefficients, if it has the best AUC # to avoid my checker complaining about all zero coefficients, force the threshold to 0.5 'thresholds': '0.5', # 'thresholds': '0.2:0.8:0.1' } start = time.time() print "\nStarting GLMGrid of", csvFilename glmGridResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLMGrid in", (time.time() - start), "secs (python)" # still get zero coeffs..best model is AUC = 0.5 with intercept only. h2o_glm.simpleCheckGLMGrid(self,glmGridResult, allowZeroCoeff=True,**kwargs)
def test_GLMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" x = range(14) # 0 and 1 are id-like values x.remove(0) x.remove(1) x.remove(3) # 3 is output x = ','.join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y kwargs = { 'x': x, 'y': y, 'n_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLMGrid_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') y = "1" # 0. member ID. not used. # 1 is output print "y:", y # FIX! thresholds is used in GLMGrid. threshold is used in GLM # comma separated means use discrete values # colon separated is min/max/step # FIX! have to update other GLMGrid tests kwargs = { 'ignored_cols': 0, 'response': y, 'n_folds': 2, 'lambda': '1e-8:1e3:100', 'alpha': '0,0.5,1', } # the gridded params make it grid..just call GLM2 gg = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=120, **kwargs) colNames = ['D','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs) h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLM2grid_covtype_many(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=20) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'response': y, 'family': 'gaussian', 'n_folds': 2, 'max_iter': max_iter, 'beta_epsilon': 1e-3, 'lambda': '0,0.5,0.8', 'alpha': '0,1e-8,1e-4', } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "glmResult:", h2o.dump_json(glmResult) # assuming it doesn't complete right away, this is the first response # it differs for the last response job_key = glmResult['job_key'] grid_key = glmResult['destination_key'] jobs.append( (job_key, grid_key) ) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(4): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start # 2/GLMGridView.html?grid_key=asd # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1 for job_key, grid_key in jobs: gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key) h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." print "totalGLMGridJobs:", totalGLMGridJobs
def test_GLM2grid_convergence_1(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 50, 'cD', 300), (100, 100, 'cE', 300), (100, 200, 'cF', 300), (100, 300, 'cG', 300), (100, 400, 'cH', 300), (100, 500, 'cI', 300), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put') print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'max_iter': 10, 'n_folds': 2, 'beta_epsilon': 1e-4, 'lambda': '1e-8:1e-3:1e2', 'alpha': '0,0.5,.75', } kwargs['response'] = y for i in range(2): start = time.time() # get rid of the Jstack polling glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm grid result", h2o.dump_json(glm) print 'glm #', i, 'end on', csvPathname, 'took', time.time( ) - start, 'seconds' # we can pass the warning, without stopping in the test, so we can # redo it in the browser for comparison h2o_glm.simpleCheckGLMGrid(self, glm, None, **kwargs)
def test_GLM_convergence_1(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 50, 'cD', 300), (100, 100, 'cE', 300), (100, 200, 'cF', 300), (100, 300, 'cG', 300), (100, 400, 'cH', 300), (100, 500, 'cI', 300), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema='put') print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'max_iter': 10, 'weight': 1.0, 'link': 'familyDefault', 'n_folds': 2, 'beta_eps': 1e-4, 'lambda': '1e-8:1e-3:1e2', 'alpha': '0,0.5,.75', 'thresholds': '0,1,0.2' } kwargs['y'] = y emsg = None for i in range(2): start = time.time() # get rid of the Jstack polling glm = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds' # we can pass the warning, without stopping in the test, so we can # redo it in the browser for comparison h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs) # gets the failed to converge, here, after we see it in the browser too if emsg is not None: raise Exception(emsg)
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting parse of", csvFilename parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=20 ) y = "10" # NOTE: hastie has two values, -1 and 1. To make H2O work if two valued and not 0,1 have kwargs = {"response": y, "max_iter": 10, "n_folds": 2, "lambda": "1e-8,1e-4,1e-3", "alpha": "0,0.25,0.8"} start = time.time() print "\nStarting GLMGrid of", csvFilename glmGridResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLMGrid in", (time.time() - start), "secs (python)" # still get zero coeffs..best model is AUC = 0.5 with intercept only. h2o_glm.simpleCheckGLMGrid(self, glmGridResult, allowZeroCoeff=True, **kwargs)
def test_GLM2grid_convergence_1(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 50, "cD", 300), (100, 100, "cE", 300), (100, 200, "cF", 300), (100, 300, "cG", 300), (100, 400, "cH", 300), (100, 500, "cI", 300), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%sx%s.csv" % (SEEDPERFILE, rowCount, colCount) csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, timeoutSecs=10, schema="put") print "Parse result['destination_key']:", parseResult["destination_key"] inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvFilename y = colCount kwargs = { "max_iter": 10, "n_folds": 2, "beta_epsilon": 1e-4, "lambda": "1e-8:1e-3:1e2", "alpha": "0,0.5,.75", } kwargs["response"] = y for i in range(2): start = time.time() # get rid of the Jstack polling glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "glm grid result", h2o.dump_json(glm) print "glm #", i, "end on", csvPathname, "took", time.time() - start, "seconds" # we can pass the warning, without stopping in the test, so we can # redo it in the browser for comparison h2o_glm.simpleCheckGLMGrid(self, glm, None, **kwargs)
def test_GLMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" x = range(14) # 0 and 1 are id-like values x.remove(0) x.remove(1) x.remove(3) # 3 is output x = ','.join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y kwargs = { 'x': x, 'y': y, 'n_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN', 'NLV', 'LIV', 'WT', 'AGLP', 'MST' ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLMGrid_basic_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = 'logreg/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') y = "1" x = range(9) x.remove(0) # 0. member ID. not used. x.remove(1) # 1 is output x = ','.join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y # FIX! thresholds is used in GLMGrid. threshold is used in GLM # comma separated means use discrete values # colon separated is min/max/step # FIX! have to update other GLMGrid tests kwargs = { 'x': x, 'y': y, 'n_folds': 2, 'beta_eps': 1e-4, 'lambda': '1e-8:1e3:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs) colNames = [ 'D', 'CAPSULE', 'AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA', 'VOL', 'GLEASON' ] # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs) h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = "logreg/" + csvFilename parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=csvFilename + ".hex", schema="put") # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" x = range(14) x.remove(0) # 0. skipping causes coefficient of 0 when used alone x.remove(3) # 3 is output x = ",".join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y kwargs = {"x": x, "y": y, "n_folds": 0, "lambda": "1e-8:1e-2:100", "alpha": "0,0.5,1", "thresholds": "0:1:0.01"} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" gg = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ "STR", "OBS", "AGMT", "FNDX", "HIGD", "DEG", "CHK", "AGP1", "AGMN", "NLV", "LIV", "WT", "AGLP", "MST", ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_B_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" xList = [] for appendx in xrange(14): if (appendx == 0): print "\nSkipping 0. Causes coefficient of 0 when used alone" elif (appendx == 3): print "\n3 is output." else: xList.append(appendx) x = ','.join(map(str, xList)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y kwargs = { 'x': x, 'y': y, 'num_cross_validation_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } # fails with num_cross_validation_folds print "Not doing num_cross_validation_folds with benign. Fails with 'unable to solve?'" gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR','OBS','AGMT','FNDX','HIGD','DEG','CHK', 'AGP1','AGMN','NLV','LIV','WT','AGLP','MST' ] # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[-1]], **kwargs) h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_GLMGrid_basic_benign(self): csvFilename = "benign.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 # cols 0-13. 3 is output # no member id in this one y = "3" x = range(14) x.remove(0) # 0. skipping causes coefficient of 0 when used alone x.remove(3) # 3 is output x = ','.join(map(str, x)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y kwargs = { 'x': x, 'y': y, 'n_folds': 0, 'lambda': '1e-8:1e-2:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs) # check the first in the models list. It should be the best colNames = [ 'STR', 'OBS', 'AGMT', 'FNDX', 'HIGD', 'DEG', 'CHK', 'AGP1', 'AGMN', 'NLV', 'LIV', 'WT', 'AGLP', 'MST' ] h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_C_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename # columns start at 0 csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") y = "1" xList = [] for appendx in xrange(9): if (appendx == 0): print "\n0 is member ID. not used" elif (appendx == 1): print "\n1 is output." else: xList.append(appendx) x = ','.join(map(str, xList)) # just run the test with all x, not the intermediate results print "\nx:", x print "y:", y # FIX! thresholds is used in GLMGrid. threshold is used in GLM # comma separated means use discrete values # colon separated is min/max/step # FIX! have to update other GLMGrid tests kwargs = { 'x': x, 'y': y, 'num_cross_validation_folds': 2, 'beta_epsilon': 1e-4, 'lambda': '1e-8:1e3:100', 'alpha': '0,0.5,1', 'thresholds': '0:1:0.01' } gg = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=120, **kwargs) colNames = ['D','CAPSULE','AGE','RACE','DPROS','DCAPS','PSA','VOL','GLEASON'] # h2o_glm.simpleCheckGLMGrid(self, gg, colNames[xList[0]], **kwargs) h2o_glm.simpleCheckGLMGrid(self, gg, None, **kwargs)
def test_parse_nflx_loop_s3n_hdfs(self): DO_GLM = False DO_GLMGRID = True USE_HOME2 = False USE_S3 = False noPoll = False benchmarkLogging = ['jstack', 'iostats'] benchmarkLogging = ['iostats'] benchmarkLogging = [] # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 synSize = 183 if USE_HOME2: csvFilenameList = [ # this should hit the "more" files too? ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize, 700 ), ("[0][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize, 700), ("[0][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize, 700), ("[0][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize, 700), # ("10k_small_gz/[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize , 700), # ("10k_small_gz/[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz", 10000 * synSize , 700), ] else: csvFilenameList = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz"), # 100 files takes too long on two machines? # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 
* avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("[A]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize / 2), 7200), ("[A-B]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_B_400_x55.dat.gz", 400 * (avgMichalSize / 2), 7200), ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_C_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200), 
("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_D_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200), ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_E_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200), ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_F_800_x55.dat.gz", 800 * (avgMichalSize / 2), 7200), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz if USE_HOME2: bucket = "home2-0xdiag-datasets/1k_small_gz" else: bucket = "home-0xdiag-datasets" if USE_S3: URI = "s3://" + bucket protocol = "s3" else: URI = "s3n://" + bucket protocol = "s3n/hdfs" # split out the pattern match and the filename used for the hex trialMax = 1 pollTimeoutSecs = 180 retryDelaySecs = 10 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): ## for tryHeap in [54, 28]: h2oPerNode = 1 # h1.4xlarge 60.5GB dram for tryHeap in [28]: print "\n", tryHeap, "GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse" # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = "-Dh2o.find-ByteBuffer-leaks=true" h2o_hosts.build_cloud_with_hosts( h2oPerNode, java_heap_GB=tryHeap, # java_extra_args=jea, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10, # all hdfs info is done thru the hdfs_config michal's ec2 config sets up? # this is for our amazon ec hdfs # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n hdfs_name_node='10.78.14.235:9000', hdfs_version='0.20.2') # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandbox_ignore_errors = True for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. 
if USE_S3: importResult = h2o.nodes[0].import_s3(bucket) else: importResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importResult['succeeded'] for k in s3nFullList: key = k['key'] # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in key: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key break else: ### print key pass ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? self.assertGreater(len(s3nFullList), 8, "Didn't see more than 8 files in s3n?") s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 start = time.time() parseKey = h2o.nodes[0].parse( s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i + 1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse2Key = h2o.nodes[0].parse( s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i + 2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse3Key = h2o.nodes[0].parse( s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print s3nKey, 'parse time:', parseKey['response']['time'] print "parse 
result:", parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs( pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM or DO_GLMGRID: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378 ]: x.remove(i) x = ",".join(map(str, x)) if DO_GLM: algo = 'GLM' GLMkwargs = { 'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 2, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLMOnly( parseKey=parseKey, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) else: algo = 'GLMGrid' GLMkwargs = { 'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4, 'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5' } start = time.time() glm = h2o_cmd.runGLMGridOnly( parseKey=parseKey, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs) h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." ### storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! 
(spill) h2o_cmd.check_key_distribution() h2o_cmd.delete_csv_key(csvFilename, s3nFullList) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
def test_GLMGrid_covtype_many(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 2, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_eps': 1e-3, 'lambda': '0,0.5,0.8', 'alpha': '0,1e-8,1e-4', 'parallelism': 1, } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): GLMResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "GLMResult:", h2o.dump_json(GLMResult) job_key = GLMResult['response']['redirect_request_args']['job'] model_key = GLMResult['response']['redirect_request_args']['destination_key'] jobs.append( (job_key, model_key) ) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(10): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start for job_key, model_key in jobs: GLMResult = h2o.nodes[0].GLMGrid_view(job=job_key, destination_key=model_key) h2o_glm.simpleCheckGLMGrid(self, GLMResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." print "totalGLMGridJobs:", totalGLMGridJobs
def test_GLM_convergence_1(self):
    """Repeatedly run GLMGrid on a hard-to-converge dataset and raise if any run
    emits a 'failed'-style warning (e.g. failed to converge)."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs) per trial
    tryList = [
        (100, 50, 'cD', 300),
        (100, 100, 'cE', 300),
        (100, 200, 'cF', 300),
        (100, 300, 'cG', 300),
        (100, 400, 'cH', 300),
        (100, 500, 'cI', 300),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    # when True, ignore the generated synthetic data and parse a canned known-failure file instead
    USEKNOWNFAILURE = True
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE,rowCount,colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        # synthetic dataset is always written, even if the known-failure file replaces it below
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        if USEKNOWNFAILURE:
            # swap in the canned 100x50 failure dataset in place of the synthetic one
            csvFilename = 'failtoconverge_100x50.csv'
            csvPathname = h2o.find_file('smalldata/logreg/' + csvFilename)
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        # response index for the synthetic data; assumes write_syn_dataset puts the
        # response after colCount feature columns — TODO confirm
        y = colCount
        kwargs = {
            'max_iter': 10,
            'weight': 1.0,
            'link': 'familyDefault',
            'n_folds': 2,
            'beta_epsilon': 1e-4,
            #***********
            'lambda': '1e-8:1e-3:1e2',
            'alpha': '0,0.5,.75',
            'thresholds': '0,1,0.2'
        }
        if USEKNOWNFAILURE:
            # canned dataset is 100x50, so its response column is 50
            kwargs['y'] = 50
        else:
            kwargs['y'] = y
        emsg = None
        for i in range(25):
            start = time.time()
            glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
                noise=("Jstack", None), **kwargs)
            print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
            # we can pass the warning, without stopping in the test, so we can
            # redo it in the browser for comparison
            warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)
            # gets the failed to converge, here, after we see it in the browser too
            x = re.compile("[Ff]ailed")
            if warnings:
                for w in warnings:
                    if (re.search(x,w)):
                        # keep only the first matching warning as the eventual exception message
                        if emsg is None:
                            emsg = w
                        print w
            if emsg:
                break
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLMGridProgress")
                time.sleep(5)
        # gets the failed to converge, here, after we see it in the browser too
        if emsg is not None:
            raise Exception(emsg)
def test_parse_nflx_loop_s3n_hdfs(self): DO_GLM = True DO_GLMGRID = False USE_S3 = False noPoll = False benchmarkLogging = ['jstack','iostats'] benchmarkLogging = ['iostats'] benchmarkLogging = [] # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 synSize = 183 csvFilenameList = [ (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), # beware: the files should be non-overlapping sequentially if noPoll is 
used, to avoid deleting keys in use (["A-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200), (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200), (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz # split out the pattern match and the filename used for the hex trialMax = 1 pollTimeoutSecs = 180 retryDelaySecs = 10 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): bucket = "home-0xdiag-datasets" ## for tryHeap in [54, 28]: h2oPerNode = 1 # h1.4xlarge 60.5GB dram for tryHeap in [28]: if USE_S3: protocol = "s3" else: protocol = "s3n" print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse" # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" # jea = "-Dh2o.find-ByteBuffer-leaks=true" h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10) # java_extra_args=jea, # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandboxIgnoreErrors = True for trial in range(trialMax): # import a list of folders, one at a time (hdfs import can't take pattern match # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket # too slow for csvFolder in csvFolderList: # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. 
if USE_S3: (importResult, importPattern) = h2i.import_only( bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3') else: (importResult, importPattern) = h2i.import_only( bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs') foundKeys = 0 for s in importResult['succeeded']: # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in s['key']: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", s['key'] break else: pass foundKeys += 1 ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?") src_key = csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] src_key = csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] src_key = URI + csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', 
path=importFolderPath + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) y = 378 if not noPoll: x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM or DO_GLMGRID: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]: x.remove(i) x = ",".join(map(str,x)) if DO_GLM: algo = 'GLM' GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) else: algo = 'GLMGrid' GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4, 'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5' } start = time.time() glm = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs) h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." ### storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! 
(spill) h2o_cmd.checkKeyDistribution() h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
def test_parse_nflx_loop_s3n_hdfs(self): DO_GLM = True DO_GLMGRID = False USE_HOME2 = False USE_S3 = False noPoll = False benchmarkLogging = ['jstack','iostats'] benchmarkLogging = ['iostats'] benchmarkLogging = [] # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 synSize = 183 if USE_HOME2: csvFilenameList = [ # this should hit the "more" files too? ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize , 700), ("[0][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize , 700), ("[0][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize , 700), ("[0][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize , 700), # ("10k_small_gz/[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize , 700), # ("10k_small_gz/[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz", 10000 * synSize , 700), ] else: csvFilenameList = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz"), # 100 files takes too long on two machines? # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), # ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), # ("manyfiles-nflx-gz/file_[1-2][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz", 
"file_140_A.dat.gz", 140 * avgMichalSize, 3600), # ("manyfiles-nflx-gz/file_[1-2][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), # ("manyfiles-nflx-gz/file_[1-2][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), # ("manyfiles-nflx-gz/file_[1-2][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), # ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("manyfiles-nflx-gz/file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("[A]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200), ("[A-B]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_B_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200), ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_C_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), 
("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_D_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_E_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), ("[A-D]-800-manyfiles-nflx-gz/file_[0-9]*.dat.gz", "file_F_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz if USE_HOME2: bucket = "home2-0xdiag-datasets/1k_small_gz" else: bucket = "home-0xdiag-datasets" if USE_S3: URI = "s3://" + bucket protocol = "s3" else: URI = "s3n://" + bucket protocol = "s3n/hdfs" # split out the pattern match and the filename used for the hex trialMax = 1 pollTimeoutSecs = 180 retryDelaySecs = 10 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): ## for tryHeap in [54, 28]: h2oPerNode = 1 # h1.4xlarge 60.5GB dram for tryHeap in [14]: print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse" # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = "-Dh2o.find-ByteBuffer-leaks=true" h2o_hosts.build_cloud_with_hosts(h2oPerNode, java_heap_GB=tryHeap, # java_extra_args=jea, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10, # all hdfs info is done thru the hdfs_config michal's ec2 config sets up? # this is for our amazon ec hdfs # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n hdfs_name_node='10.78.14.235:9000', hdfs_version='0.20.2') # don't raise exception if we find something bad in h2o stdout/stderr? h2o.nodes[0].sandbox_ignore_errors = True for trial in range(trialMax): # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. 
if USE_S3: importResult = h2o.nodes[0].import_s3(bucket) else: importResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importResult['succeeded'] for k in s3nFullList: key = k['key'] # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in key: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", key break else: ### print key pass ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? self.assertGreater(len(s3nFullList),8,"Didn't see more than 8 files in s3n?") s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse2Key = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] s3nKey = URI + "/" + csvFilepattern key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", s3nKey, "to", key2 parse3Key = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print s3nKey, 'parse time:', parseKey['response']['time'] print "parse result:", 
parseKey['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.check_enums_from_inspect(parseKey) #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM or DO_GLMGRID: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]: x.remove(i) x = ",".join(map(str,x)) if DO_GLM: algo = 'GLM' GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) else: algo = 'GLMGrid' GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4, 'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5' } start = time.time() glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs) h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." ### storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! 
(spill) h2o_cmd.check_key_distribution() h2o_cmd.delete_csv_key(csvFilename, s3nFullList) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
def test_GLM_convergence_1(self):
    """Exercise GLMGrid convergence on synthetic (or a known-failing) logistic data.

    For each (rowCount, colCount) shape a random csv is generated, but when
    USEKNOWNFAILURE is True the known non-converging dataset
    smalldata/logreg/failtoconverge_100x50.csv is substituted instead.
    GLMGrid is run twice per dataset; any returned warning matching
    "failed"/"Failed" (i.e. failed to converge) is collected but allowed to
    pass simpleCheckGLMGrid, so the result can still be inspected in the
    browser, and the test raises at the end if such a warning was seen.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # each entry: (rowCount, colCount, key2, timeoutSecs)
    tryList = [
        (100, 50, 'cD', 300),
        (100, 100, 'cE', 300),
        (100, 200, 'cF', 300),
        (100, 300, 'cG', 300),
        (100, 400, 'cH', 300),
        (100, 500, 'cI', 300),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    # when True, every trial parses the canned failure dataset instead of the
    # freshly generated synthetic one (which is still written, but unused)
    USEKNOWNFAILURE = True
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount, colCount)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        if USEKNOWNFAILURE:
            # override the synthetic file with the known failure case
            csvFilename = 'failtoconverge_100x50.csv'
            csvPathname = h2o.find_file('smalldata/logreg/' + csvFilename)
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        # response column is the last column of the synthetic data
        y = colCount
        # gridded lambda/alpha/thresholds (colon = min:max:step, comma = discrete)
        kwargs = {
            'max_iter': 10,
            'weight': 1.0,
            'link': 'familyDefault',
            'n_folds': 2,
            'beta_epsilon': 1e-4,
            'lambda': '1e-8:1e-3:1e2',
            'alpha': '0,0.5,.75',
            'thresholds': '0,1,0.2'
        }
        if USEKNOWNFAILURE:
            # failtoconverge_100x50.csv has its response in column 50
            kwargs['y'] = 50
        else:
            kwargs['y'] = y
        # first "failed to converge"-style warning seen, if any
        emsg = None
        for i in range(2):
            start = time.time()
            # get rid of the Jstack polling
            glm = h2o_cmd.runGLMGridOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print 'glm #', i, 'end on', csvPathname, 'took', time.time() - start, 'seconds'
            # we can pass the warning, without stopping in the test, so we can
            # redo it in the browser for comparison
            warnings = h2o_glm.simpleCheckGLMGrid(self, glm, None, allowFailWarning=True, **kwargs)
            # gets the failed to converge, here, after we see it in the browser too
            x = re.compile("[Ff]ailed")
            if warnings:
                for w in warnings:
                    if (re.search(x, w)):
                        # first
                        if emsg is None: emsg = w
                        print w
            # no point repeating the run once a failure warning was captured
            if emsg: break
        if not h2o.browse_disable:
            # show the grid progress page before (possibly) failing the test
            h2b.browseJsonHistoryAsUrlLastMatch("GLMGridProgress")
            time.sleep(5)
        # gets the failed to converge, here, after we see it in the browser too
        if emsg is not None: raise Exception(emsg)
def test_GLMGrid_covtype_many(self): csvFilename = "covtype.data" csvPathname = "UCI/UCI-large/covtype/" + csvFilename parseResult = h2i.import_parse(bucket="datasets", path=csvPathname, schema="put", timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvPathname, " num_rows:", "{:,}".format(inspect["num_rows"]), " num_cols:", "{:,}".format( inspect["num_cols"] ) x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { "x": x, "y": y, "family": "binomial", "link": "logit", "n_folds": 2, "case_mode": "=", "case": 1, "max_iter": max_iter, "beta_eps": 1e-3, "lambda": "0,0.5,0.8", "alpha": "0,1e-8,1e-4", "parallel": 1, } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): GLMResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "GLMResult:", h2o.dump_json(GLMResult) job_key = GLMResult["response"]["redirect_request_args"]["job"] model_key = GLMResult["response"]["redirect_request_args"]["destination_key"] jobs.append((job_key, model_key)) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(10): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse( bucket="datasets", path=csvPathname, schema="put", src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False, ) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start for job_key, model_key in jobs: GLMResult = h2o.nodes[0].GLMGrid_view(job=job_key, destination_key=model_key) h2o_glm.simpleCheckGLMGrid(self, GLMResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." print "totalGLMGridJobs:", totalGLMGridJobs
def test_GLMGrid_covtype_many(self): csvFilename = 'covtype.data' csvPathname = 'standard/' + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'n_folds': 2, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_eps': 1e-3, 'lambda': '0,0.5,0.8', 'alpha': '0,1e-8,1e-4', 'parallelism': 1, } start = time.time() jobs = [] totalGLMGridJobs = 0 for i in range(3): GLMResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=300, noPoll=True, **kwargs) # print "GLMResult:", h2o.dump_json(GLMResult) job_key = GLMResult['response']['redirect_request_args']['job'] model_key = GLMResult['response']['redirect_request_args'][ 'destination_key'] jobs.append((job_key, model_key)) totalGLMGridJobs += 1 # do some parse work in parallel. Don't poll for parse completion # don't bother checking the parses when they are completed (pollWaitJobs looks at all) for i in range(10): time.sleep(3) hex_key = str(i) + ".hex" src_key = str(i) + ".src" parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key, timeoutSecs=10, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=300) elapsed = time.time() - start for job_key, model_key in jobs: GLMResult = h2o.nodes[0].GLMGrid_view(job=job_key, destination_key=model_key) h2o_glm.simpleCheckGLMGrid(self, GLMResult, **kwargs) print "All GLMGrid jobs completed in", elapsed, "seconds." print "totalGLMGridJobs:", totalGLMGridJobs