def test_many_cols_and_values_with_syn(self): SEED = random.randint(0, sys.maxint) print "\nUsing random seed:", SEED # SEED = random.seed(SEED) SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, 'cA', 5), (100, 1000, 'cB', 5), # (100, 900, 'cC', 5), # (100, 500, 'cD', 5), # (100, 100, 'cE', 5), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: for sel in range(48): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = key2 + "_" + str(sel) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=5) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(3)
def test_putfile_a5m(self): timeoutSecs = 500 csvFilenameList = [ # use different names for each parse # doesn't fail if gzipped? ("a5m.csv", 'A', None), ("a5m.csv", 'B', None), ("a5m.csv", 'C', None), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, key, trees) in csvFilenameList: csvPathname = csvFilename # creates csvFilename and csvFilename.hex keys parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRF(trees=trees,depth=25,parseResult=parseResult, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_parse_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, 'cA', 60), (100, 6000, 'cB', 60), (100, 7000, 'cC', 60), (100, 8000, 'cD', 60), (100, 8200, 'cE', 60), (100, 8500, 'cF', 60), (100, 9000, 'cG', 60), (100, 10000, 'cI', 60), (100, 11000, 'cH', 60), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_many_cols_with_syn(self): ### h2b.browseTheCloud() csvFilename = "logreg_trisum_int_cat_10000x10.csv" csvPathname = "smalldata/logreg/" + csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename paramDict = define_params() paramDict2 = {} for k in paramDict: # sometimes we have a list to pick from in the value. now it's just list of 1. paramDict2[k] = paramDict[k][0] y = 10 # FIX! what should we have for case? 1 should be okay because we have 1's in output col kwargs = {'y': y, 'max_iter': 50} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_GLM_from_import_hosts(self):
    # GLM reproducibility check: run binomial GLM on covtype (small file locally,
    # scaled replicas on remote hosts) and verify each run's validation error and
    # coefficients match the first run's.
    if localhost:
        csvFilenameList = ["covtype.data"]
    else:
        # repeats are deliberate: parsing/modeling the same file twice checks
        # run-to-run reproducibility
        csvFilenameList = [
            "covtype200x.data",
            "covtype200x.data",
            "covtype.data",
            "covtype.data",
            "covtype20x.data",
            "covtype20x.data",
        ]
    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = "/home/0xdiag/datasets/standard"
    # baselines captured from the first iteration, compared against thereafter
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
        print csvFilename, "parse time:", parseKey["response"]["time"]
        print "Parse result['destination_key']:", parseKey["destination_key"]
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
        print "\n" + csvFilename
        start = time.time()  # NOTE(review): start is never read afterwards
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm["GLMModel"]
        coefficients = GLMModel["coefficients"]
        validationsList = GLMModel["validations"]
        # take the last validation entry
        validations = validationsList.pop()
        # validations['err']
        # first iteration seeds the baselines; later iterations compare to them
        if validations1:
            h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        sys.stdout.write(".")
        sys.stdout.flush()
def test_parse_many_cols_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, 'cA', 60), (100, 6000, 'cB', 60), (100, 7000, 'cC', 60), (100, 8000, 'cD', 60), (100, 8200, 'cE', 60), (100, 8500, 'cF', 60), (100, 9000, 'cG', 60), (100, 10000, 'cI', 60), (100, 11000, 'cH', 60), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_GLM2_many_cols_tridist(self):
    # GLM2 over synthetic datasets of increasing width (10..600 cols, 10k rows):
    # generate, parse, inspect, then run GLM with the last column as response.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs); timeoutSecs is used for GLM only
    tryList = [
        (10000, 10, 'cA', 300),
        (10000, 20, 'cB', 300),
        (10000, 30, 'cC', 300),
        (10000, 40, 'cD', 300),
        (10000, 50, 'cE', 300),
        (10000, 60, 'cF', 300),
        (10000, 70, 'cG', 300),
        (10000, 80, 'cH', 300),
        (10000, 90, 'cI', 300),
        (10000, 100, 'cJ', 300),
        (10000, 200, 'cK', 300),
        (10000, 300, 'cL', 300),
        (10000, 400, 'cM', 300),
        (10000, 500, 'cN', 300),
        (10000, 600, 'cO', 300),
    ]
    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
            rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        # NOTE(review): parse uses a hard-coded 30s timeout; tryList's
        # timeoutSecs is only applied to the GLM call below
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        print "\nParse result['destination_key']:", parseResult[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        # take the first value from each param list in the module-level paramDict
        paramDict2 = {}
        for k in paramDict:
            paramDict2[k] = paramDict[k][0]
        # response is the last column index
        y = colCount
        kwargs = {'response': y}
        kwargs.update(paramDict2)
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        # NOTE(review): always checks coefficient 'C9' regardless of colCount -
        # presumably one representative coefficient is enough; confirm intent
        h2o_glm.simpleCheckGLM(self, glm, 'C9', **kwargs)
        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
def test_parse_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, 'cA', 10), (100, 6000, 'cB', 10), (100, 7000, 'cC', 10), (100, 8000, 'cD', 10), (100, 8200, 'cE', 10), (100, 8500, 'cF', 10), (100, 9000, 'cG', 10), (100, 10000, 'cI', 10), (100, 11000, 'cH', 10), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=60) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_rf_multinomial_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_multinomial.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 400 colCount = 7 for trial in range (5): write_syn_dataset(csvPathname, totalRows, colCount, headerData) # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hexKey = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True) start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) modelKey = rfView['drf_model']['_key'] h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=colCount+1, vpredict=1, expectedAuc=0.5, doAUC=False) h2b.browseJsonHistoryAsUrlLastMatch("RF")
def test_many_cols_and_values_with_syn(self): SEED = random.randint(0, sys.maxint) print "\nUsing random seed:", SEED # SEED = random.seed(SEED) SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 10, "cA", 5), (100, 1000, "cB", 5), # (100, 900, 'cC', 5), # (100, 500, 'cD', 5), # (100, 100, 'cE', 5), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: for sel in range(48): # len(caseList) SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) selKey2 = key2 + "_" + str(sel) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=5) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(3)
def test_rf_kddcup_1999(self):
    # RF on kddcup_1999: four trials compare default params vs. explicitly
    # specified browser-default params, and OOBE vs. full scoring.
    # since we'll be waiting, pop a browser
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    csvFilename = 'kddcup_1999.data.gz'
    print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\
        "compared to running with the parameters specified and matching the browser RF query defaults. " +\
        "Also run the param for full scoring vs OOBE scoring."
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300)
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    for trials in range(4):
        print "\n" + csvFilename, "Trial #", trials
        start = time.time()
        # explicit params intended to mirror the browser RF query defaults
        kwargs = {
            'response_variable': 'classifier',
            'ntree': 200,
            'gini': 1,
            'class_weights': None,
            'stratify': 0,
            # 'features': None,
            'features': 7,
            'ignore': None,
            'sample': 67,
            'bin_limit': 1024,
            'depth': 2147483647,
            'seed': 784834182943470027,
            'parallel': 1,
            'exclusive_split_limit': None,
        }
        # trial 0: pure defaults; trials 1-3 vary only the OOBE flag
        if trials == 0:
            kwargs = {}
        elif trials == 1:
            kwargs['out_of_bag_error_estimate'] = None
        elif trials == 2:
            kwargs['out_of_bag_error_estimate'] = 0
        elif trials == 3:
            kwargs['out_of_bag_error_estimate'] = 1
        start = time.time()
        # NOTE(review): trees=50 is passed alongside kwargs' 'ntree': 200 for
        # trials > 0 - which one wins depends on runRFOnly's handling; confirm
        RFview = h2o_cmd.runRFOnly(trees=50, parseKey=parseKey, timeoutSecs=300,
            retryDelaySecs=1.0, **kwargs)
        print "RF end on ", csvFilename, 'took', time.time(
        ) - start, 'seconds'
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_putfile_a5m(self): timeoutSecs = 500 csvFilenameList = [ # use different names for each parse # doesn't fail if gzipped? ("a5m.csv", 'A', None), ("a5m.csv", 'B', None), ("a5m.csv", 'C', None), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, key, trees) in csvFilenameList: csvPathname = csvFilename # creates csvFilename and csvFilename.hex keys parseResult = h2i.import_parse(path=csvPathname, schema='put', timeoutSecs=500) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseResult=parseResult, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_from_import_hosts(self):
    # New-API GLM reproducibility check on covtype: each run's validation error
    # and coefficients are compared against the first run's.
    if localhost:
        csvFilenameList = [
            'covtype.data',
        ]
    else:
        # repeats are deliberate: same file modeled twice checks reproducibility
        csvFilenameList = [
            'covtype200x.data',
            'covtype200x.data',
            'covtype.data',
            'covtype.data',
            'covtype20x.data',
            'covtype20x.data',
        ]
    # a browser window too, just because we can
    ## h2b.browseTheCloud()
    importFolderPath = "standard"
    # baselines captured from the first iteration, compared against thereafter
    validations1= {}
    coefficients1= {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename
        start = time.time()  # NOTE(review): start is never read afterwards
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        # take the last validation entry
        validations = validationsList.pop()
        # validations['err']
        # first iteration seeds the baselines; later iterations compare to them
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        sys.stdout.write('.')
        sys.stdout.flush()
def test_parse_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, "cA", 10), (100, 6000, "cB", 10), (100, 7000, "cC", 10), (100, 8000, "cD", 10), (100, 8200, "cE", 10), (100, 8500, "cF", 10), (100, 9000, "cG", 10), (100, 10000, "cI", 10), (100, 11000, "cH", 10), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] inspect = h2o_cmd.runInspect(None, parseKey["destination_key"], timeoutSecs=60) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_import_multi_syn_datasets(self):
    # Import a whole folder of csv files via a regex, parse them to one combined
    # frame, inspect it, then run a quick 1-tree RF as a sanity check.
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    print "This imports a folder of csv files..i.e points to syn_datasets with no regex"
    print "Doesn't put anything in syn_datasets. When run with import folder redirected"
    print "to import S3, there is a syn_datasets with 100 files"
    print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?"
    timeoutSecs = 500
    # regex choice depends on whether the import folder is redirected to S3
    if h2o.nodes[0].redirect_import_folder_to_s3_path:
        csvFilenameAll = [
            # FIX! ..just folder doesn't appear to work. add regex
            # need a destination_key...h2o seems to use the regex if I don't provide one
            ### "syn_datasets/*",
            "syn_datasets/*_10000x200*",
        ]
    else:
        csvFilenameAll = [
            # FIX! ..just folder doesn't appear to work. add regex
            # need a destination_key...h2o seems to use the regex if I don't provide one
            ### "syn_datasets/*",
            "syn_datasets/*",
        ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll
    # pop open a browser on the cloud
    ### h2b.browseTheCloud()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2="syn_datasets.hex", timeoutSecs=500)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            "from all files num_rows:", "{:,}".format(inspect['num_rows']), \
            "num_cols:", "{:,}".format(inspect['num_cols'])
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey[
            'destination_key']
        # quick RF sanity run on the combined data
        kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
        start = time.time()
        RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        # report how much of the allotted timeout the RF actually used
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        # so we can see!
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        time.sleep(5)
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cI', 5), (100, 5000, 'cA', 5), (100, 6000, 'cB', 5), (100, 7000, 'cC', 5), (100, 8000, 'cD', 5), (100, 8200, 'cE', 5), (100, 8500, 'cF', 5), (100, 9000, 'cG', 5), (100, 11000, 'cH', 5), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cI', 5), (100, 11000, 'cH', 5), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=120) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_many_cols_01(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 5000, 'cA', 5), (100, 10000, 'cI', 5), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=120, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=120) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_GLM_with_logit_result_1(self):
    # Generate a dataset from a known random logit equation, then check GLM can
    # recover it (simpleCheckGLM compares against the known coefficients).
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs-for-GLM)
    tryList = [
        (100000, 5, 'cA', 300),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)  # NOTE(review): lenNodes is never read afterwards
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
            rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname, \
            "using random coefficients and intercept and logit eqn. for output"
        # ground-truth model the GLM result will be checked against
        (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
        print coefficients, intercept
        write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEEDPERFILE)
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        # response is the last column (the generated logit output)
        y = colCount
        kwargs = {
            'y': y,
            'max_iter': 60,
            'lambda': 1e-4,
            'alpha': 0,
            'weight': 1.0,
            'n_folds': 3,
            'beta_epsilon': 1e-4,
            'thresholds': 0.5,
        }
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        # rebinds coefficients/intercept to the fitted values
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(5)
def test_exec2_int2cat_nested(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 10, 'cA', 100), (1000, 20, 'cB', 100), (1000, 30, 'cC', 100), (1000, 40, 'cD', 100), (1000, 10, 'cE', 100), (1000, 20, 'cF', 100), (1000, 30, 'cG', 100), (1000, 40, 'cH', 100), ] ### h2b.browseTheCloud() # we're going to do a special exec across all the columns to turn them into enums # including the duplicate of the output! exprList = [ '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);', '<keyX>[,<col1>] = factor(<keyX>[,1]);', '<keyX>[,1] = factor(<keyX>[,<col2>]);', '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);', '<keyX>[,<col1>] = factor(<keyX>[,1]);', '<keyX>[,1] = factor(<keyX>[,<col2>]);' \ ] exprList = [ '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);', '<keyX>[,<col1>] = factor(<keyX>[,1]);', '<keyX>[,1] = factor(<keyX>[,<col2>]);', '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);', '<keyX>[,<col1>] = factor(<keyX>[,1]);', '<keyX>[,1] = factor(<keyX>[,<col2>]);' \ ] exprList = [ '<keyX>[,2] = factor(<keyX>[,2])', ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename print "\nNow running the exec commands across all input cols" colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount, timeoutSecs=30, incrementingResult=False) print "\nexec colResultList", colResultList if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(3)
def test_many_cols_real(self):
    # GLM over synthetic real-valued datasets of increasing width (100..1000
    # cols), finishing with an offset/view Inspect to exercise paging.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs-for-GLM)
    tryList = [
        (1000, 100, 'cA', 300),
        (1000, 200, 'cB', 300),
        (1000, 300, 'cC', 300),
        (1000, 400, 'cD', 300),
        (1000, 500, 'cE', 300),
        (1000, 1000, 'cJ', 300),
    ]
    ### h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
            rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        # response is the last column index
        y = colCount
        kwargs = {
            'y': y,
            'max_iter': 50,
            'case': '1',
            'case_mode': '=',
            'lambda': 1e-4,
            'alpha': 0.6
        }
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
        # try new offset/view
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=100, view=100)
def test_GLM_many_cols_int2cat(self):
    # Generate wide int datasets, exec colSwap/factor across all columns to turn
    # ints into enums, then run GLM on the enum-expanded frame.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs-for-GLM)
    tryList = [
        (10000, 10, 'cA.hex', 100),
        (10000, 20, 'cB.hex', 200),
        (10000, 30, 'cC.hex', 300),
        (10000, 40, 'cD.hex', 400),
        (10000, 50, 'cE.hex', 500),
    ]
    ### h2b.browseTheCloud()
    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    exprList = [
        '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
        ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
    ]
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2,
            maxCol=colCount, timeoutSecs=90, incrementingResult=False)
        print "\nexec colResultList", colResultList
        # take the first value from each param list in the module-level paramDict
        paramDict2 = {}
        for k in paramDict:
            paramDict2[k] = paramDict[k][0]
        # since we add the output twice, it's no longer colCount-1
        y = colCount
        kwargs = {'y': y, 'max_iter': 50, 'case': 1}
        kwargs.update(paramDict2)
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        # only col y-1 (next to last)doesn't get renamed in coefficients
        # due to enum/categorical expansion
        print "y:", y
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(3)
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(3)
def test_cs_training(self):
    """RF smoke test on the credit-sample training set (100 trees, depth 100)."""
    trainPath = h2o.find_file('smalldata/kaggle/creditsample-training.csv.gz')
    h2o_cmd.runRF(
        trees=100,
        depth=100,
        csvPathname=trainPath,
        timeoutSecs=300,
        response_variable=1)
    h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_many_cols_int2cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 10, 'cA', 100), (1000, 20, 'cB', 100), (1000, 30, 'cC', 100), (1000, 40, 'cD', 100), (1000, 10, 'cE', 100), (1000, 20, 'cF', 100), (1000, 30, 'cG', 100), (1000, 40, 'cH', 100), ] ### h2b.browseTheCloud() # we're going to do a special exec across all the columns to turn them into enums # including the duplicate of the output! if 1 == 0: exprList = [ '<keyX> = colSwap(<keyX>,<col1>,' + 'colSwap(<keyX>,<col2>,' + 'colSwap(<keyX>,<col1>,' + 'colSwap(<keyX>,<col2>,' + '<keyX>[0]' + '))))', ] else: exprList = [ '<keyX> = colSwap(<keyX>,<col1>,' + '<keyX>[0]' + ')', ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename print "\nNow running the int 2 enum exec command across all input cols" colResultList = h2e.exec_expr_list_across_cols( None, exprList, key2, maxCol=colCount, timeoutSecs=30, incrementingResult=False) print "\nexec colResultList", colResultList if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(3)
def test_GLM_with_logit_result_1_NA(self):
    # Same as the logit-recovery test, but the dataset generator leaves all of
    # col 1 empty; verifies inspect reports the NAs and GLM still runs without
    # an explicit x selection.
    print "Put NAs in col 1...all of col 1 is empty"
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, key2, timeoutSecs-for-GLM)
    tryList = [
        (100000, 5, 'cA', 300),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)  # NOTE(review): lenNodes is never read afterwards
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname, \
            "using random coefficients and intercept and logit eqn. for output"
        # ground-truth model the GLM result will be checked against
        (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
        print coefficients, intercept
        write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEEDPERFILE)
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename
        # response is the last column (the generated logit output)
        y = colCount
        print "Don't specify x to GLM, even though there are NA's on col 1"
        # FIX! should check inspect missing
        # every row should be missing in col 1 by construction
        n = inspect['cols'][1]['num_missing_values']
        print "num_missing_values in col 1:", n
        self.assertEqual(n, rowCount, \
            msg="Expect col 1 to have num_missing_values: %d equal to rowCount: %d" % (n, rowCount))
        kwargs = {
            'y': y,
            'max_iter': 60,
            'lambda': 1e-4,
            'alpha': 0,
            'weight': 1.0,
            'n_folds': 3,
            'beta_epsilon': 1e-4,
            'thresholds': 0.5,
        }
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        # rebinds coefficients/intercept to the fitted values
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(5)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'YearPredictionMSD.txt' ] else: csvFilenameList = [ 'YearPredictionMSD.txt' ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) validations1= {} coefficients1= {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) # different when n_foldsidation is used? No trainingErrorDetails? h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_json_browse_both_exec(self):
    """Run randomized exec expressions against parsed covtype on random nodes,
    then inspect the result keys on several nodes, probing Exec/Inspect json
    race conditions."""
    lenNodes = len(h2o.nodes)
    csvPathname = 'standard/covtype.data'
    hex_key = 'c.hex'
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='put', hex_key=hex_key, timeoutSecs=10)
    print "\nParse key is:", parseResult['destination_key']

    ## h2b.browseTheCloud()
    # for trial in range(53):
    # NOTE(review): exprList and inspectList look like module-level globals -- confirm at file top
    trial = 0
    while (trial < 100):
        for exprTemplate in exprList:
            trial = trial + 1
            n = trial
            colX = random.randint(1,54)
            row = random.randint(1,400000)

            # substitute random col/row/key values into the expression template
            execExpr = exprTemplate
            execExpr = re.sub('<col1>',str(colX),execExpr)
            execExpr = re.sub('<col2>',str(colX+1),execExpr)
            execExpr = re.sub('<n>',str(n),execExpr)
            execExpr = re.sub('<row>',str(row),execExpr)
            execExpr = re.sub('<keyX>',str(hex_key),execExpr)

            # pick a random node to execute it on
            randNode = random.randint(0,lenNodes-1)
            print "\nexecExpr:", execExpr, "on node", randNode

            start = time.time()
            resultExec = h2o_cmd.runExec(node=h2o.nodes[randNode], execExpr=execExpr, timeoutSecs=15)
            h2o.verboseprint(h2o.dump_json(resultExec))
            # print(h2o.dump_json(resultExec))

            # FIX! race conditions. If json is done, does that mean you can inspect it??
            # wait until the 2nd iteration, which will guarantee both Result1 and Result2 exist
            if trial > 1:
                inspectMe = random.choice(inspectList)
                # inspect the same key from three different nodes
                resultInspect = h2o.nodes[0].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[1].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

                resultInspect = h2o.nodes[2].inspect(inspectMe)
                h2o.verboseprint(h2o.dump_json(resultInspect))

            # FIX! if we race the browser doing the exec too..it shouldn't be a problem?
            # might be a bug?

            # WARNING! we can't browse the Exec url history, since that will
            # cause the Exec to execute again thru the browser..i.e. it has side effects
            # just look at the last inspect, which should be the resultInspect!
            # h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # NOTE(review): the live call below browses "Exec" despite the warning above -- confirm intent
            h2b.browseJsonHistoryAsUrlLastMatch("Exec")

            h2o.check_sandbox_for_errors()
            print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
def test_many_cols(self):
    """Parse wide synthetic datasets (fvec path) and page through Inspect
    with offset/view windows, including a negative offset."""
    h2o.beta_features = True  # module-level switch: use the fvec/beta endpoints
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, destination key, timeoutSecs)
    tryList = [
        (100, 10000, 'cI', 5),
        (100, 5000, 'cA', 5),
        (100, 6000, 'cB', 5),
        (100, 7000, 'cC', 5),
        (100, 8000, 'cD', 5),
        (100, 8200, 'cE', 5),
        (100, 8500, 'cF', 5),
        (100, 9000, 'cG', 5),
        (100, 11000, 'cH', 5),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        # NOTE: fixed 120s parse timeout here, not the per-case timeoutSecs
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=120, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

        # try new offset/view
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=100, view=100)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=99, view=89)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=-1, view=53)
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cI', 5), (100, 5000, 'cA', 5), (100, 6000, 'cB', 5), (100, 7000, 'cC', 5), (100, 8000, 'cD', 5), (100, 8200, 'cE', 5), (100, 8500, 'cF', 5), (100, 9000, 'cG', 5), (100, 11000, 'cH', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5) # try new offset/view inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=100, view=100) inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=99, view=89) inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=-1, view=53)
def test_from_import(self):
    """Import a datasets folder, parse each covtype file, and run a
    single-tree, depth-limited RF on it."""
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    timeoutSecs = 500

    # "covtype169x.data",
    # "covtype.13x.shuffle.data",
    # "3G_poker_shuffle"
    # "covtype20x.data",
    # "billion_rows.csv.gz",
    csvFilenameAll = [
        "covtype.data",
        "covtype20x.data",
        # "covtype200x.data",
        # "100million_rows.csv",
        # "200million_rows.csv",
        # "a5m.csv",
        # "a10m.csv",
        # "a100m.csv",
        # "a200m.csv",
        # "a400m.csv",
        # "a600m.csv",
        # "billion_rows.csv.gz",
        # "new-poker-hand.full.311M.txt.gz",
    ]
    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        # poker and the water.UDP.set3(UDP.java) fail issue..
        # constrain depth to 25
        RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=parseKey, timeoutSecs=timeoutSecs)

        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # wait in case it recomputes it
        time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_many_cols_enum(self):
    """Parse wide synthetic enum datasets, toggling whether the first row is
    treated as a header, then page through Inspect with offset/view."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, header flag, destination key, timeoutSecs)
    tryList = [
        (100, 11000, 0, 'cA', 180),
        (100, 10000, 1, 'cB', 180),
        (100, 9000, 0, 'cC', 180),
        (100, 8000, 1, 'cD', 180),
        (100, 7000, 0, 'cE', 180),
        (100, 6000, 1, 'cF', 180),
        (100, 5000, 0, 'cG', 180),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    # it's interesting to force the first enum row to be used as header or not
    # with many cols, we tend to hit limits about stuff fitting in a chunk (header or data)
    for (rowCount, colCount, header, hex_key, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, header, SEED)

        # NOTE: fixed 60s parse timeout here, not the per-case timeoutSecs
        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=header,
            hex_key=hex_key, timeoutSecs=60)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

        # try new offset/view
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=100, view=100)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=99, view=89)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=-1, view=53)
def test_exec2_int2cat_nested(self):
    """Parse small synthetic datasets, then run a chain of exec2 column-copy
    assignments (a triple swap written twice) across all input columns."""
    h2o.beta_features = True  # module-level switch: use the fvec/beta endpoints
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, destination key, timeoutSecs)
    tryList = [
        (1000, 10, 'cA', 100),
        (1000, 20, 'cB', 100),
        (1000, 30, 'cC', 100),
        (1000, 40, 'cD', 100),
        (1000, 10, 'cE', 100),
        (1000, 20, 'cF', 100),
        (1000, 30, 'cG', 100),
        (1000, 40, 'cH', 100),
    ]
    ### h2b.browseTheCloud()
    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    exprList = [
        '<keyX>[,<col2>] = <keyX>[,<col1>];',
        '<keyX>[,<col1>] = <keyX>[,1];',
        '<keyX>[,1] = <keyX>[,<col2>];',
        '<keyX>[,<col2>] = <keyX>[,<col1>];',
        '<keyX>[,<col1>] = <keyX>[,1];',
        '<keyX>[,1] = <keyX>[,<col2>];'
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # per-file seed so each generated dataset is distinct but reproducible from the log
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the exec commands across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key,
            maxCol=colCount, timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(3)
def test_import_multi_syn_datasets(self):
    """Import a folder (regex) of syn_datasets files, parse them into a single
    key, run a 1-tree RF, and report the fraction of timeout used."""
    # just do the import folder once
    # importFolderPath = "/home/hduser/hdfs_datasets"
    importFolderPath = '/home/0xdiag/datasets'
    print "This imports a folder of csv files..i.e points to syn_datasets with no regex"
    print "Doesn't put anything in syn_datasets. When run with import folder redirected"
    print "to import S3, there is a syn_datasets with 100 files"
    print "FIX! When run locally, I should have some multi-files in", importFolderPath, "/syn_datasets?"
    timeoutSecs = 500

    # narrower regex when redirected to S3 (it has 100 files), broad otherwise
    if h2o.nodes[0].redirect_import_folder_to_s3_path:
        csvFilenameAll = [
            # FIX! ..just folder doesn't appear to work. add regex
            # need a destination_key...h2o seems to use the regex if I don't provide one
            ### "syn_datasets/*",
            "syn_datasets/*_10000x200*",
        ]
    else:
        csvFilenameAll = [
            # FIX! ..just folder doesn't appear to work. add regex
            # need a destination_key...h2o seems to use the regex if I don't provide one
            ### "syn_datasets/*",
            "syn_datasets/*",
        ]

    # csvFilenameList = random.sample(csvFilenameAll,1)
    csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    ### h2b.browseTheCloud()

    for csvFilename in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)

        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2="syn_datasets.hex", timeoutSecs=500)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            "from all files num_rows:", "{:,}".format(inspect['num_rows']), \
            "num_cols:", "{:,}".format(inspect['num_cols'])

        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
        start = time.time()
        RFview = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        # report how close to the timeout the RF came
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # so we can see!
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        time.sleep(5)
def test_rf_kddcup_1999(self):
    """RF on kddcup_1999: trial 0 uses pure H2O defaults (empty kwargs);
    trials 1-3 use explicit browser-matching params while varying
    out_of_bag_error_estimate (None / 0 / 1)."""
    # since we'll be waiting, pop a browser
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    csvFilename = 'kddcup_1999.data.gz'
    print "Want to see that I get similar results when using H2O RF defaults (no params to json)" +\
        "compared to running with the parameters specified and matching the browser RF query defaults. " +\
        "Also run the param for full scoring vs OOBE scoring."
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=300)
    print csvFilename, 'parse time:', parseKey['response']['time']
    print "Parse result['destination_key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None,parseKey['destination_key'])

    for trials in range(4):
        print "\n" + csvFilename, "Trial #", trials
        start = time.time()

        # explicit params intended to match the browser RF query defaults
        kwargs = {
            'response_variable': 'classifier',
            'ntree': 200,
            'gini': 1,
            'class_weights': None,
            'stratify': 0,
            # 'features': None,
            'features': 7,
            'ignore': None,
            'sample': 67,
            'bin_limit': 1024,
            'depth': 2147483647,
            'seed': 784834182943470027,
            'parallel': 1,
            'exclusive_split_limit': None,
        }

        # trial 0 discards the dict above entirely (pure defaults);
        # trials 1-3 keep it and only vary out_of_bag_error_estimate
        if trials == 0:
            kwargs = {}
        elif trials == 1:
            kwargs['out_of_bag_error_estimate'] = None
        elif trials == 2:
            kwargs['out_of_bag_error_estimate'] = 0
        elif trials == 3:
            kwargs['out_of_bag_error_estimate'] = 1

        start = time.time()
        # NOTE(review): trees=50 here while kwargs carries 'ntree': 200 --
        # confirm which parameter the RF endpoint honors
        RFview = h2o_cmd.runRFOnly(trees=50,parseKey=parseKey, timeoutSecs=300, retryDelaySecs=1.0, **kwargs)
        print "RF end on ", csvFilename, 'took', time.time() - start, 'seconds'

        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_GLM_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 100), (10000, 200, 'cB', 200), (10000, 300, 'cC', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'y': y, 'max_iter': 50, 'case': 1, 'family': 'binomial', 'lambda': 0, 'alpha': 0, 'max_iter': 50, 'weight': 1.0, 'thresholds': 0.5, 'n_folds': 2, 'beta_epsilon':1.0E-4, } start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "y:", y h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(10) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(10)
def test_GLM_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 100), (10000, 200, 'cB', 200), (10000, 300, 'cC', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'y': y, 'max_iter': 50, 'case': 1, 'family': 'binomial', 'lambda': 0, 'alpha': 0, 'max_iter': 50, 'weight': 1.0, 'thresholds': 0.5, 'n_folds': 2, 'beta_eps':1.0E-4, } start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "y:", y h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(10) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(10)
def test_GLM_many_cols_tridist(self):
    """GLM over synthetic datasets of increasing width; extra kwargs are the
    first choice of each entry in the module-level paramDict."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, destination key, timeoutSecs)
    tryList = [
        (10000, 10, 'cA', 300),
        (10000, 20, 'cB', 300),
        (10000, 30, 'cC', 300),
        (10000, 40, 'cD', 300),
        (10000, 50, 'cE', 300),
        (10000, 60, 'cF', 300),
        (10000, 70, 'cG', 300),
        (10000, 80, 'cH', 300),
        (10000, 90, 'cI', 300),
        (10000, 100, 'cJ', 300),
        (10000, 200, 'cK', 300),
        (10000, 300, 'cL', 300),
        (10000, 400, 'cM', 300),
        (10000, 500, 'cN', 300),
        (10000, 600, 'cO', 300),
    ]
    ### h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "\nParse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        # take the first choice from each paramDict entry
        paramDict2 = {}
        for k in paramDict:
            paramDict2[k] = paramDict[k][0]

        # output column is the last one
        y = colCount
        kwargs = {'y': y}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        # checks the coefficient for col 8 -- NOTE(review): assumes >= 9 cols,
        # which holds for every tryList entry
        h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)
def test_many_cols_int2cat(self):
    """Parse small synthetic datasets, then run a colSwap exec expression
    across every input column to turn the int columns into enums."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, destination key, timeoutSecs)
    tryList = [
        (1000, 10, 'cA', 100),
        (1000, 20, 'cB', 100),
        (1000, 30, 'cC', 100),
        (1000, 40, 'cD', 100),
        (1000, 10, 'cE', 100),
        (1000, 20, 'cF', 100),
        (1000, 30, 'cG', 100),
        (1000, 40, 'cH', 100),
    ]
    ### h2b.browseTheCloud()
    # we're going to do a special exec across all the columns to turn them into enums
    # including the duplicate of the output!
    # the 'if 1==0' branch is a disabled nested-colSwap variant kept for reference
    if 1==0:
        exprList = [
            '<keyX> = colSwap(<keyX>,<col1>,' +
            'colSwap(<keyX>,<col2>,' +
            'colSwap(<keyX>,<col1>,' +
            'colSwap(<keyX>,<col2>,' +
            '<keyX>[0]' +
            '))))',
        ]
    else:
        exprList = [
            '<keyX> = colSwap(<keyX>,<col1>,' +
            '<keyX>[0]' +
            ')',
        ]

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        # per-file seed so each generated dataset is distinct but reproducible from the log
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        print "\nNow running the int 2 enum exec command across all input cols"
        colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2,
            maxCol=colCount, timeoutSecs=30, incrementingResult=False)
        print "\nexec colResultList", colResultList

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(3)
def test_parse_rand_utf8_angle_start(self):
    """Parse synthetic single-column UTF-8 datasets and verify every column
    comes back typed Enum with zero NAs; numCols may be colCount+1 when the
    module-level DOUBLE_QUOTE termination hack is on."""
    h2b.browseTheCloud()
    h2o.beta_features = True  # module-level switch: use the fvec/beta endpoints
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, destination key, timeoutSecs)
    tryList = [
        (1000, 1, 'cA', 120),
        (1000, 1, 'cG', 120),
        (1000, 1, 'cH', 120),
    ]
    print "What about messages to log (INFO) about unmatched quotes (before eol)"
    # got this ..trying to avoid for now
    # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO
    print "what we used"
    print "ordinalChoices:", ordinalChoices
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
            hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
        print "inspect:", h2o.dump_json(inspect)
        numRows = inspect['numRows']
        # Don't check for now..going to get empty rows
        # self.assertEqual(numRows, rowCount, msg='Wrong numRows likely due to unmatched " row going to NA: %s %s' % (numRows, rowCount))
        numCols = inspect['numCols']

        # because of our double quote termination hack above
        if DOUBLE_QUOTE:
            self.assertTrue((numCols==colCount or numCols==colCount+1),
                msg='Wrong numCols: %s %s' % (numCols, colCount))
        else:
            self.assertTrue(numCols==colCount,
                msg='Wrong numCols: %s %s' % (numCols, colCount))

        # every generated column must be a clean Enum with no NAs
        for k in range(colCount):
            naCnt = inspect['cols'][k]['naCnt']
            self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
            stype = inspect['cols'][k]['type']
            self.assertEqual("Enum", stype, msg='col %s type %s should be Enum' % (k, stype))

        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        time.sleep(5)
def test_B_putfile_files(self): timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" # "covtype20x.data", # "billion_rows.csv.gz", csvFilenameList = [ ("covtype.data", 1), ("covtype20x.data", 1), # ("covtype200x.data", None), # ("a5m.csv", None), # ("a10m.csv", None), # ("a100m.csv", None), # ("a200m.csv", None), # ("a400m.csv", None), # ("a600m.csv", None), # ("100million_rows.csv, None"), # ("200million_rows.csv", None), # ("billion_rows.csv.gz", 1), # memory issue on one machine. no RF # ("new-poker-hand.full.311M.txt.gz", None), ] # pop open a browser on the cloud h2b.browseTheCloud() for (csvFilename, trees) in csvFilenameList: csvPathname = h2o.find_file('/home/0xdiag/datasets/' + csvFilename) # creates csvFilename and csvFilename.hex keys parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=csvFilename, timeoutSecs=500) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # constrain depth to 25 if trees is not None: RFview = h2o_cmd.runRFOnly(trees=trees,depth=25,parseKey=parseKey, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_many_cols(self):
    """GLM on a synthetic dataset generated from a random logit equation;
    checks warnings/coefficients/intercept via simpleCheckGLM."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100000, 5, 'cA', 300),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname, \
            "using random coefficients and intercept and logit eqn. for output"
        (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
        print coefficients, intercept
        write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEEDPERFILE)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        # output column is the last one
        y = colCount
        kwargs = {'y': y,
            'max_iter': 60,
            'lambda': 1e-4,
            'alpha': 0,
            'weight': 1.0,
            # what about these?
            # 'link': [None, 'logit','identity', 'log', 'inverse'],
            'n_folds': 3,
            'beta_epsilon': 1e-4,
            'thresholds': 0.5,
        }
        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(5)
def test_many_cols_long_enums(self):
    """Repeatedly parse a small synthetic (5 x 100) dataset with long enum
    values into the same key; explicit separator, no forced header."""
    h2o.beta_features = True  # module-level switch: use the fvec/beta endpoints
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # eight identical shapes on purpose: same key 'cA' re-parsed each time
    tryList = [
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
        (5, 100, 'cA', 5),
    ]
    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        SEPARATOR = ord(',')
        # don't force header..we have NAs in the rows, and NAs mess up headers
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=10, header=0, separator=SEPARATOR)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

        # try new offset/view
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
def test_many_cols_enum(self):
    """Parse synthetic enum datasets of increasing width, toggling the header
    flag, then page through Inspect with offset/view."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, header flag, destination key, timeoutSecs)
    tryList = [
        # (100, 11000, 0, 'cA', 180),
        # (100, 10000, 1, 'cB', 180),
        # (100, 8000, 1, 'cD', 180),
        # (100, 7000, 0, 'cE', 180),
        # (100, 6000, 1, 'cF', 180),
        (100, 1000, 0, 'cH', 120),
        (100, 1000, 1, 'cI', 120),
        (100, 2000, 1, 'cI', 120),
        (100, 3000, 1, 'cI', 120),
        (100, 4000, 1, 'cI', 120),
        (100, 5000, 0, 'cG', 180),
        (100, 9000, 0, 'cC', 180),
        (100, 10000, 1, 'cB', 180),
    ]
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    # it's interesting to force the first enum row to be used as header or not
    # with many cols, we tend to hit limits about stuff fitting in a chunk (header or data)
    for (rowCount, colCount, header, hex_key, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, header, SEED)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=header,
            hex_key=hex_key, timeoutSecs=timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(5)

        # try new offset/view
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=100, view=100)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=99, view=89)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=-1, view=53)
def test_B_hdfs_files(self):
    """Load files from HDFS (maprfs schema), parse each, and run a
    single-tree RF."""
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "TEST-poker1000.csv",
    ]
    # pick 8 randomly!
    # the 'if (1==0)' branch is disabled; the full list is used in order
    if (1==0):
        csvFilenameList = random.sample(csvFilenameAll,8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisions, to avoid slow drift with each iteration
    firstglm = {}
    h2i.setupImportHdfs(path='/datasets', schema='maprfs')
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets',
            schema='maprfs', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1,parseKey=parseKey,timeoutSecs=2000)

        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # wait in case it recomputes it
        time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_many_cols_with_syn(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5, 100, 'cA', 5), (5, 100, 'cA', 5), (5, 100, 'cA', 5), (5, 100, 'cA', 5), (5, 100, 'cA', 5), (5, 100, 'cA', 5), (5, 100, 'cA', 5), (5, 100, 'cA', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, key2, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) SEPARATOR = ord(',') parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10, separator=SEPARATOR, header=1) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5) # try new offset/view inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
def test_GLM_many_cols_enum(self):
    """Binomial GLM over synthetic enum datasets; extra kwargs take the first
    choice of each entry in the module-level paramDict."""
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # int values get translated to these enum letters in the generated csv
    translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
    # (rowCount, colCount, destination key, timeoutSecs)
    tryList = [
        (10000, 100, 'cA', 100),
        (10000, 200, 'cB', 200),
        (10000, 300, 'cC', 300),
    ]
    ### h2b.browseTheCloud()
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvFilename

        # take the first choice from each paramDict entry
        paramDict2 = {}
        for k in paramDict:
            paramDict2[k] = paramDict[k][0]

        # output column is the last one
        y = colCount
        kwargs = {'y': y, 'max_iter': 50, 'case': 1}
        kwargs.update(paramDict2)

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        # only col Y-1 (next to last)doesn't get renamed in coefficients due to enum/categorical expansion
        print "y:", y
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        if not h2o.browse_disable:
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")
            time.sleep(15)
            h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            time.sleep(15)
def pollWaitJobs(pattern=None, timeoutSecs=30, pollTimeoutSecs=30, retryDelaySecs=5, benchmarkLogging=None): anyBusy = True waitTime = 0 while anyBusy: # timeout checking has to move in here now! just count loops anyBusy = False a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs) ## print "jobs_admin():", h2o.dump_json(a) jobs = a["jobs"] patternKeys = [] for j in jobs: ### h2o.verboseprint(j) # save the destination keys for any GLMModel in progress if pattern and pattern in j["destination_key"]: patternKeys.append(j["destination_key"]) if j["end_time"] == "": anyBusy = True h2o.verboseprint( "waiting", waitTime, "secs, still not done - ", "destination_key:", j["destination_key"], "progress:", j["progress"], "cancelled:", j["cancelled"], "end_time:", j["end_time"], ) h2b.browseJsonHistoryAsUrlLastMatch("Jobs") if anyBusy and waitTime > timeoutSecs: print h2o.dump_json(jobs) raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds") sys.stdout.write(".") sys.stdout.flush() time.sleep(retryDelaySecs) waitTime += retryDelaySecs # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack) # test would pass ['cpu','disk','jstack'] kind of list if benchmarkLogging: h2o.cloudPerfH2O.get_log_save(benchmarkLogging) return patternKeys
def test_rf_multinomial_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_multinomial.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" totalRows = 400 colCount = 7 for trial in range(5): write_syn_dataset(csvPathname, totalRows, colCount, headerData) # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hexKey = csvFilename + "_" + str(trial) + ".hex" ntree = 2 kwargs = { 'ntrees': ntree, 'mtries': None, 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': None, 'nbins': 1024, 'seed': 784834182943470027, } parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True) start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs) print "trial #", trial, 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) modelKey = rfView['drf_model']['_key'] h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=colCount + 1, vpredict=1, expectedAuc=0.5, doAUC=False) h2b.browseJsonHistoryAsUrlLastMatch("RF")
def test_rf_from_import_hosts(self): # just do the import folder once timeoutSecs = 500 # "covtype169x.data", # "covtype.13x.shuffle.data", # "3G_poker_shuffle" csvFilenameList = [ "billion_rows.csv.gz", # "covtype20x.data", ] importFolderPath = "standard" # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=500) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename start = time.time() # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs) h2b.browseJsonHistoryAsUrlLastMatch("RFView") # wait in case it recomputes it time.sleep(10) sys.stdout.write('.') sys.stdout.flush()
def test_rf_tnc3_fvec(self): h2o.beta_features = True csvPathname = 'tnc3.csv' print "\n" + csvPathname hex_key = "tnc3.hex" ### h2b.browseTheCloud() parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10, retryDelaySecs=0.25, header=1) print "Parse result['Key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") if 1==1: lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after num swap", colResultList if (1==1): print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser" print 'The good case with ignore="boat,body"' rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25, ignored_cols_by_name="boat,body") inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5) #****************** if 1==0: colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10, retryDelaySecs=0.25) print "\ncolResultList after char swap", colResultList if 1==1: print "\nNow the bad case (no ignore)" rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)
def pollWaitJobs(pattern=None, timeoutSecs=30, pollTimeoutSecs=30, retryDelaySecs=5, benchmarkLogging=None):
    """Poll the cloud's job list until no job is still busy.

    Args:
        pattern: substring; destination keys containing it are collected.
        timeoutSecs: max total wait before raising Exception.
        pollTimeoutSecs: per-request timeout for the jobs_admin call.
        retryDelaySecs: sleep between polls.
        benchmarkLogging: optional list like ['cpu','disk','jstack'] to save perf logs.

    Returns:
        patternKeys: matching destination keys from the final poll.
    """
    anyBusy = True
    waitTime = 0
    while (anyBusy):
        # timeout checking has to move in here now! just count loops
        anyBusy = False
        a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs)
        ## print "jobs_admin():", h2o.dump_json(a)
        jobs = a['jobs']
        # rebuilt on every poll, so the returned list reflects the final poll only
        patternKeys = []
        for j in jobs:
            ### h2o.verboseprint(j)
            # save the destination keys for any GLMModel in progress
            if pattern and pattern in j['destination_key']:
                patternKeys.append(j['destination_key'])
            # an empty end_time means the job is still running
            if j['end_time'] == '':
                anyBusy = True
                h2o.verboseprint("waiting", waitTime, "secs, still not done - ",\
                    "destination_key:", j['destination_key'], \
                    "progress:", j['progress'], \
                    "cancelled:", j['cancelled'],\
                    "end_time:", j['end_time'])
        h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if (anyBusy and waitTime > timeoutSecs):
            print h2o.dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after", timeoutSecs, "seconds")
        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs
        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)
    return patternKeys
def test_many_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 10000, 'cI', 5), (100, 5000, 'cA', 5), (100, 6000, 'cB', 5), (100, 7000, 'cC', 5), (100, 8000, 'cD', 5), (100, 8200, 'cE', 5), (100, 8500, 'cF', 5), (100, 9000, 'cG', 5), (100, 11000, 'cH', 5), ] ### h2b.browseTheCloud() lenNodes = len(h2o.nodes) cnum = 0 for (rowCount, colCount, hex_key, timeoutSecs) in tryList: cnum += 1 csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5) # try new offset/view inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=100, view=100) inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=99, view=89) inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], offset=-1, view=53)
def test_KMeans_twit(self): csvFilename = "Twitter2DB.txt" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/' + csvFilename) # h2b.browseTheCloud() # parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", separator=9) # force tab sep parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # loop, to see if we get same centers # should check the means? # FIX! have to fix these to right answers expected = [ # expected centers are from R. rest is just from h2o ([310527.2, 13433.89], 11340, None), ([5647967.1, 40487.76], 550, None), ([21765291.7, 93129.26], 14, None), ] # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) for trial in range(2): kwargs = { 'k': 3, 'max_iter': 50, 'epsilon': 1e-4, 'normalize': 0, 'cols': '0,1', 'initialization': 'Furthest', # 'initialization': 'PlusPlus', 'destination_key': 'kmeans_dest_key', # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310 } kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseKey, 'd', **kwargs) if 1 == 0: h2b.browseJsonHistoryAsUrlLastMatch("KMeansScore") h2b.browseJsonHistoryAsUrlLastMatch("KMeansApply") h2b.browseJsonHistoryAsUrlLastMatch("KMeans") time.sleep(3600) h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)
def test_tnc3_ignore(self): csvFilename = 'tnc3.csv' csvPathname = h2o.find_file('smalldata/' + csvFilename) print "\n" + csvPathname key2 = "tnc3.hex" h2b.browseTheCloud() parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1) print "Parse result['Key']:", parseKey['destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(10) if 1==1: lenNodes = len(h2o.nodes) colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after num swap", colResultList if (1==1): print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser" print 'The good case with ignore="boat,body"' rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") #****************** if 1==0: colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10) print "\ncolResultList after char swap", colResultList if 1==1: print "\nNow the bad case (no ignore)" rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") ### time.sleep(3600) h2b.browseJsonHistoryAsUrlLastMatch("RFView") if not h2o.browse_disable: ### print "\n <ctrl-C> to quit sleeping here" ### time.sleep(1500) pass
def test_GLM_twovalues(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename # H2O might not do whitespace stripping on numbers correctly, when , is {SEP} # GLM will auto expand categoricals..so if we have more coefficients than expected # that means it didn't parse right # mix in space/tab combos # just done like this for readability rowDataTrueRaw = \ "<sp>1,\ 0<sp>,\ <tab>65,\ 1<tab>,\ <sp><tab>2,\ 1<sp><tab>,\ <tab><sp>1,\ 4<tab><sp>,\ <tab><tab>1,\ 4<tab><tab>,\ <sp><sp>1,\ 4<sp><sp>" rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw) rowDataTrue = re.sub("<tab>"," ", rowDataTrue) rowDataFalse = \ "0,\ 1,\ 0,\ -1,\ -2,\ -1,\ -1,\ -4,\ -1,\ -4,\ -1,\ -4" twoValueList = [ ('A','B',0, 14), ('A','B',1, 14), (0,1,0, 12), (0,1,1, 12), (0,1,'NaN', 12), (1,0,'NaN', 12), (-1,1,0, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() key = csvFilename + "_" + str(trial) kwargs = {'case': case, 'y': 10, 'family': 'binomial', 'alpha': 0, 'beta_eps': 0.0002} # default takes 39 iterations? play with alpha/beta glm = h2o_cmd.runGLM(csvPathname=csvPathname, key=key) h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs) # check that the number of entries in coefficients is right (12 with intercept) coeffNum = len(glm['GLMModel']['coefficients']) if (coeffNum!=coeffNum): raise Exception("Should be " + coeffNum + " coefficients in result. %s" % coeffNum) print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds' h2b.browseJsonHistoryAsUrlLastMatch("GLM") h2o.check_sandbox_for_errors() trial += 1
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'covtype.data', ] else: csvFilenameList = [ 'covtype200x.data', 'covtype200x.data', 'covtype.data', 'covtype.data', 'covtype20x.data', 'covtype20x.data', ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_many_cols_int2cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA.hex', 100), (10000, 20, 'cB.hex', 200), (10000, 30, 'cC.hex', 300), (10000, 40, 'cD.hex', 400), (10000, 50, 'cE.hex', 500), ] ### h2b.browseTheCloud() # we're going to do a special exec across all the columns to turn them into enums # including the duplicate of the output! exprList = [ '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))', ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])', ] for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename print "\nNow running the int 2 enum exec command across all input cols" colResultList = h2e.exec_expr_list_across_cols( None, exprList, key2, maxCol=colCount, timeoutSecs=90, incrementingResult=False) print "\nexec colResultList", colResultList paramDict2 = {} for k in paramDict: paramDict2[k] = paramDict[k][0] # since we add the output twice, it's no longer colCount-1 y = colCount kwargs = {'y': y, 'max_iter': 50, 'case': 1} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' # only col y-1 (next to last)doesn't get renamed in coefficients # due to enum/categorical expansion print "y:", y h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if not h2o.browse_disable: 
h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(3) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(3)