def test_GLM_poisson_1(self): csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1 == 0): print "WARNING: just doing the first 33 features, for comparison to ??? numbers" # pythonic! x = ",".join(map(str, range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" kwargs = { 'x': x, 'y': y, 'family': 'poisson', 'link': 'log', 'n_folds': 0, 'max_iter': max_iter, 'beta_epsilon': 1e-3 } timeoutSecs = 120 # L2 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def test_tnc3_ignore(self):
    """Parse tnc3_10.csv, then run GLM (y=13, 6-fold CV) twice.

    The two `(1==0)` exec-expr sections are disabled toggles; when enabled they
    swap num/char values across columns before the corresponding GLM pass.
    NOTE(review): `lenNodes`, `numExprList` and `charExprList` are only
    referenced inside the disabled branches -- `lenNodes` is bound only in the
    first disabled branch, so enabling only the second would NameError. Confirm
    before flipping the toggles.
    """
    csvFilename = 'tnc3_10.csv'
    csvPathname = h2o.find_file('smalldata/' + csvFilename)
    print "\n" + csvPathname
    key2 = "tnc3.hex"
    h2b.browseTheCloud()
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
    print "Parse result['Key']:", parseKey['destination_key']
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
    ### time.sleep(10)

    # disabled: numeric swap across the first 10 columns
    if (1==0):
        lenNodes = len(h2o.nodes)
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after num swap", colResultList

    # first GLM pass
    if (1==1):
        start = time.time()
        kwargs = {'y': 13, 'num_cross_validation_folds': 6}
        # hmm. maybe we should update to use key as input
        # in case exec is used to change the parseKey
        # in any case, the destination_key in parseKey was what was updated
        # so if we Exec, it's correct.
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    #******************
    # disabled: char swap, then a second (enabled) GLM pass identical to the first
    if (1==0):
        colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
            incrementingResult=False, timeoutSecs=10)
        print "\ncolResultList after char swap", colResultList

    if (1==1):
        start = time.time()
        kwargs = {'y': 13, 'num_cross_validation_folds': 6}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

    # optionally idle so a human can poke at the browser windows
    if not h2o.browse_disable:
        ### print "\n <ctrl-C> to quit sleeping here"
        ### time.sleep(1500)
        pass
def test_GLM_covtype(self): csvFilename = 'covtype.data' csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,timeoutSecs=10) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) if (1==0): print "WARNING: just doing the first 33 features, for comparison to allstate numbers" # pythonic! x = ",".join(map(str,range(33))) else: x = "" print "WARNING: max_iter set to 8 for benchmark comparisons" max_iter = 8 y = "54" # L2 args = { 'x': x, 'y': y, 'family': 'binomial', 'link': 'logit', 'num_cross_validation_folds': 0, 'case_mode': '=', 'case': 1, 'max_iter': max_iter, 'beta_eps': 1e-3} timeoutSecs = 120 start = time.time() kwargs.update({'alpha': 0, 'lambda': 0}) glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L2) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # Elastic kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (Elastic) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs) # L1 kwargs.update({'alpha': 1, 'lambda': 1e-4}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm (L1) end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 13, **kwargs)
def test_C_prostate(self): print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for appendx in xrange(9): if (appendx == 0): print "\n0 is member ID. not used" elif (appendx == 1): print "\n1 is output." else: if x == "": x = str(appendx) else: x = x + "," + str(appendx) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y, 'num_cross_validation_folds': 5} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
def test_B_benign(self): print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" x = "" # cols 0-13. 3 is output # no member id in this one for appendx in xrange(14): if (appendx == 3): print "\n3 is output." else: if x == "": x = str(appendx) else: x = x + "," + str(appendx) csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with num_cross_validation_folds print "Not doing num_cross_validation_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_GLM_model_key_unique(self): modelKeyDict = {} for trial in range (1,5): csvPathname = 'iris/iris2.csv' start = time.time() # h2o.py now sets destination_key for a fixed default model name, # we want h2o to create model names for this test, so use none here kwargs = {'destination_key': None, 'y':4, 'family': 'binomial', 'case': 1, 'case_mode': '>'} # make sure each parse is unique dest key (not in use hex_key = "iris2_" + str(trial) + ".hex" parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10, noPoll=True, **kwargs ) glmResult = h2o_cmd.runGLMOnly(parseResult=parseResutl, timeoutSecs=10, noPoll=True, **kwargs ) print "GLM #%d" % trial, "started on ", csvPathname, 'took', time.time() - start, 'seconds' model_key = glmResult['destination_key'] print "GLM model_key:", model_key if model_key in modelKeyDict: raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" % (trial, modelKeyDict[model_key])) modelKeyDict[model_key] = trial # just show the jobs still going, if any. maybe none, because short (iris) a = h2o.nodes[0].jobs_admin() h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_GLM_params_rand2_8977501266014959103(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) # SEED = random.randint(0, sys.maxint) SEED = 8977501266014959103 # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_syn_2659x1049x2enum(self):
    # Parse a wide synthetic enum dataset and run GLM on it with defaults.
    csvFilename = "syn_2659x1049x2enum.csv"
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
    # NOTE(review): `params` is not defined in this function; presumably a
    # module-level dict of GLM parameters -- if not, this raises NameError.
    # Verify against the rest of the file.
    kwargs = params
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=240, **kwargs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_C_prostate(self): print "\nStarting prostate.csv" # columns start at 0 y = "1" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,9): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM. standardize normalizes the data. kwargs = {'x': x, 'y': y, 'n_folds': 5,\ 'expert': 1, 'lsm_solver': 'GenGradient', 'standardize':1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': 'binomial', 'max_iter': 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds'] * 10 + params['max_iter'] * 10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_C_prostate(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting prostate.csv" # columns start at 0 y = "1" x = "" csvFilename = "prostate.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") for maxx in range(2,6): x = range(maxx) x.remove(0) # 0 is member ID. not used x.remove(1) # 1 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y, 'n_folds': 5} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs) sys.stdout.write('.') sys.stdout.flush() h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download()
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
def test_glm_covtype_single_cols(self): timeoutSecs = 10 csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds'
def test_B_importFolder_files(self): # just do the import folder once importFolderPath = "/home/0xdiag/datasets/standard" h2i.setupImportFolder(None, importFolderPath) timeoutSecs = 1500 csvFilenameAll = [ # quick test first "covtype.data", # then the real thing "billion_rows.csv.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud ### h2b.browseTheCloud() for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=timeoutSecs, pollTimeoutSecs=60) elapsed = time.time() - start print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs) # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename kwargs = { 'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1 } # one coefficient is checked a little more colX = 0 # L2 kwargs.update({'alpha': 0, 'lambda': 0}) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) sys.stdout.write('\n.') sys.stdout.flush()
def test_GLM_syn_2659x1049x2enum(self):
    # Import/parse a wide synthetic enum dataset from smalldata and run GLM.
    csvFilename = "syn_2659x1049x2enum.csv"
    csvPathname = 'logreg' + '/' + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put')
    # NOTE(review): `params` is not defined in this function; presumably a
    # module-level dict of GLM parameters -- if not, this raises NameError.
    # Verify against the rest of the file.
    kwargs = params
    glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=240, **kwargs)
    h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_B_benign(self): print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(4,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y # solver can be ADMM kwargs = {'x': x, 'y': y,\ 'expert': 1, 'lsm_solver': 'GenGradient', 'standardize': 1, 'n_folds': 1} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=30, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() sys.stdout.write('.') sys.stdout.flush()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30) y = "10" x = "" # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values 1,-1. need to specify case kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python)" h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'covtype.data', ] else: csvFilenameList = [ 'covtype200x.data', 'covtype200x.data', 'covtype.data', 'covtype.data', 'covtype20x.data', 'covtype20x.data', ] # a browser window too, just because we can ## h2b.browseTheCloud() importFolderPath = "standard" validations1= {} coefficients1= {} for csvFilename in csvFilenameList: # have to re-import each iteration now, since the source key # is removed and if we re-parse it, it's not there csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_princeton(self): # filename, y, timeoutSecs # these are all counts? using gaussian? csvFilenameList = [ ('cuse.dat', 'gaussian', 3, 5), # notUsing ('cuse.dat', 'gaussian', 4, 5), # using ('copen.dat', 'gaussian', 4, 5), ('housing.raw', 'gaussian', 4, 5), ] trial = 0 for (csvFilename, family, y, timeoutSecs) in csvFilenameList: csvPathname1 = 'logreg/princeton/' + csvFilename fullPathname1 = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True) csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_stripped.csv' h2o_util.file_strip_trailing_spaces(fullPathname1, csvPathname2) kwargs = {'n_folds': 0, 'family': family, 'link': 'familyDefault', 'y': y} parseResult = h2i.import_parse(path=csvPathname2, schema='put', timeoutSecs=timeoutSecs, **kwargs) start = time.time() glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end (w/check) on ", csvPathname2, 'took', time.time() - start, 'seconds' trial += 1 print "\nTrial #", trial
def test_C_hhp_107_01(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) # pop open a browser on the cloud h2b.browseTheCloud() # build up the parameter string in X y = "106" x = "" # go right to the big X and iterate on that case ### for trial in range(2): for trial in range(2): print "\nTrial #", trial, "start" print "\nx:", x print "y:", y start = time.time() kwargs = {'y': y} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=200, **kwargs) h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs) h2o.check_sandbox_for_errors() ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") print "\nTrial #", trial
def test_GLM_big1_nopoll(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname y = "106" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) glmInitial = [] # dispatch multiple jobs back to back start = time.time() for jobDispatch in range(40): kwargs = {'x': x, 'y': y, 'n_folds': 1} # FIX! what model keys do these get? glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, noPoll=True, **kwargs) glmInitial.append(glm) print "glm job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch timeoutSecs = 200 h2o_jobs.pollWaitJobs(pattern='GLMModel', timeoutSecs=timeoutSecs, retryDelaySecs=10) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # we saved the initial response? # if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected for glm in glmInitial: print "Checking completed job, with no polling:", glm a = h2o.nodes[0].poll_url(glm['response'], noPoll=True) h2o_glm.simpleCheckGLM(self, a, 57, **kwargs)
def test_C_hhp_107_01(self): csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz") print "\n" + csvPathname y = "106" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) for trial in xrange(3): sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, 57, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "\nTrial #", trial
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = ["covtype.data"] else: csvFilenameList = [ "covtype200x.data", "covtype200x.data", "covtype.data", "covtype.data", "covtype20x.data", "covtype20x.data", ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = "/home/0xdiag/datasets/standard" validations1 = {} coefficients1 = {} for csvFilename in csvFilenameList: # have to re-import each iteration now, since the source key # is removed and if we re-parse it, it's not there h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60) # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000) print csvFilename, "parse time:", parseKey["response"]["time"] print "Parse result['destination_key']:", parseKey["destination_key"] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey["destination_key"]) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm["GLMModel"] coefficients = GLMModel["coefficients"] validationsList = GLMModel["validations"] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, "err", validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write(".") sys.stdout.flush()
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with n_folds print "Not doing n_folds with benign. Fails with 'unable to solve?'" glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_params_rand2_newargs(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') key = 'covtype.20k' parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 10 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # Took n_folds out, because GLM doesn't include n_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'x': x, 'y': y, 'case': -1} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_many_cols_with_syn(self): ### h2b.browseTheCloud() csvFilename = "logreg_trisum_int_cat_10000x10.csv" csvPathname = "smalldata/logreg/" + csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(None, h2o.find_file(csvPathname), key2=key2, timeoutSecs=10) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename paramDict = define_params() paramDict2 = {} for k in paramDict: # sometimes we have a list to pick from in the value. now it's just list of 1. paramDict2[k] = paramDict[k][0] y = 10 # FIX! what should we have for case? 1 should be okay because we have 1's in output col kwargs = {'y': y, 'max_iter': 50} kwargs.update(paramDict2) start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=20, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(5)
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'num_cross_validation_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k") # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_glm_covtype_single_cols(self): timeoutSecs = 10 csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "\n" + csvPathname # columns start at 0 y = "54" x = "" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=15) print "GLM binomial wth 1 X column at a time" print "Result check: abs. value of coefficient and intercept returned are bigger than zero" for colX in xrange(54): if x == "": x = str(colX) else: # x = x + "," + str(colX) x = str(colX) sys.stdout.write('.') sys.stdout.flush() print "\nx:", x print "y:", y start = time.time() kwargs = {'x': x, 'y': y, 'n_folds': 6, 'case': 2} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30): print "\nStarting GLM of", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10) y = "10" x = "" # Took num_cross_validation_folds out, because GLM doesn't include num_cross_validation_folds time and it's slow # wanted to compare GLM time to my measured time # hastie has two values, 1 and -1. need to use case for one of them kwargs = {'x': x, 'y': y, 'case': -1} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "GLM in", (time.time() - start), "secs (python measured)" h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, the results # should be similar? GLMModel = glm['GLMModel'] validationsList = glm['GLMModel']['validations'] validations = validationsList[0] # validations['err'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1) else: self.validations1 = copy.deepcopy(validations)
    def test_C_prostate_w_predict(self):
        # GLM on prostate.csv over growing x-column subsets (always dropping
        # the ID and output columns), then predict with the resulting model.
        # Node logs are viewed/downloaded before and after to bracket the run.
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
        print "\nStarting prostate.csv"
        # columns start at 0
        y = "1"
        x = ""
        csvFilename = "prostate.csv"
        csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        for maxx in range(2,6):
            x = range(maxx)
            x.remove(0) # 0 is member ID. not used
            x.remove(1) # 1 is output
            # NOTE(review): for maxx=2 both removals empty the list, so x is ""
            # for that run -- presumably GLM then uses its default column set;
            # confirm that is intended.
            x = ",".join(map(str,x))
            print "\nx:", x
            print "y:", y
            kwargs = {'x': x, 'y': y, 'n_folds': 5}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs)
            # ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
            h2o_glm.simpleCheckGLM(self, glm, 'AGE', **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']
            print "Doing predict with same dataset, and the GLM model"
            h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
        h2o.nodes[0].log_view()
        namelist = h2o.nodes[0].log_download()
def test_B_benign_w_predict(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} # fails with n_folds glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=15, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] print "Doing predict with same dataset, and the GLM model" h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=parseKey['destination_key'])
def test_GLM_gamma_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 24 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_B_benign(self): h2o.nodes[0].log_view() namelist = h2o.nodes[0].log_download() print "\nStarting benign.csv" csvFilename = "benign.csv" csvPathname = 'logreg' + '/' + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=csvFilename + ".hex", schema='put') # columns start at 0 y = "3" # cols 0-13. 3 is output # no member id in this one for maxx in range(11,14): x = range(maxx) x.remove(3) # 3 is output x = ",".join(map(str,x)) print "\nx:", x print "y:", y kwargs = {'x': x, 'y': y} glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=15, **kwargs) # no longer look at STR? h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) sys.stdout.write('.') sys.stdout.flush()
    def test_loop_random_param_covtype(self):
        # 20 GLM trials on covtype: poisson-family defaults with random params
        # overlaid by pickRandGlmParams each trial; timeout scaled by the
        # trial's n_folds and max_iter.
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 54,
                'n_folds': 3,
                'family': "poisson",
                'alpha': 0.5,
                'lambda': 1e-4,
                'beta_epsilon': 0.001,
                'max_iter': 15,
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            # make timeout bigger with xvals
            timeoutSecs = 60 + (kwargs['n_folds']*20)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1)))
            start = time.time()
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
    def test_GLM_many_cols_int2cat(self):
        # Write synthetic datasets, convert every input column to enum via an
        # exec colSwap/factor expression, then run GLM with the module-level
        # paramDict's first choices; optionally browse results afterwards.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000, 10, 'cA.hex', 100),
            (10000, 20, 'cB.hex', 200),
            (10000, 30, 'cC.hex', 300),
            (10000, 40, 'cD.hex', 400),
            (10000, 50, 'cE.hex', 500),
        ]
        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
            '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
            ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
        ]
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount,
                timeoutSecs=90, incrementingResult=False)
            print "\nexec colResultList", colResultList
            # paramDict is module-level; take the first choice for each param
            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # since we add the output twice, it's no longer colCount-1
            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # only col y-1 (next to last)doesn't get renamed in coefficients
            # due to enum/categorical expansion
            print "y:", y
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(3)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
    def test_GLM_with_logit_result_1(self):
        # Generate a synthetic dataset whose output follows a random logit
        # equation, then fit a GLM and check the recovered coefficients.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000, 5, 'cA', 300),
        ]
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, \
                "using random coefficients and intercept and logit eqn. for output"
            (coefficients, intercept) = gen_rand_equation(colCount, SEEDPERFILE)
            print coefficients, intercept
            write_syn_dataset(csvPathname, rowCount, colCount, coefficients, intercept, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            # output is the last column
            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 60,
                'lambda': 1e-4,
                'alpha': 0,
                'weight': 1.0,
                'n_folds': 3,
                'beta_epsilon': 1e-4,
                'thresholds': 0.5,
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            # simpleCheckGLM returns (warnings, coefficients, intercept); col 0 checked
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, 0, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)
    def test_many_cols_real(self):
        # GLM on wide synthetic real-valued datasets, then re-inspect with an
        # offset/view to exercise result paging.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 100, 'cA', 300),
            (1000, 200, 'cB', 300),
            (1000, 300, 'cC', 300),
            (1000, 400, 'cD', 300),
            (1000, 500, 'cE', 300),
            (1000, 1000, 'cJ', 300),
        ]
        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            # output is the last column
            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 50,
                'case': '1',
                'case_mode': '=',
                'lambda': 1e-4,
                'alpha': 0.6
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
            # try new offset/view
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], offset=100, view=100)
def test_GLM_from_import_hosts(self): if localhost: csvFilenameList = [ 'YearPredictionMSD.txt' ] else: csvFilenameList = [ 'YearPredictionMSD.txt' ] # a browser window too, just because we can h2b.browseTheCloud() importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) validations1= {} coefficients1= {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=120) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs) # different when n_foldsidation is used? No trainingErrorDetails? h2o.verboseprint("\nglm:", glm) h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
def test_GLM_hdfs_YearPredictionMSD(self): if localhost: csvFilenameList = [ 'YearPredictionMSD.txt', 'YearPredictionMSD.txt' ] else: csvFilenameList = [ 'YearPredictionMSD.txt', 'YearPredictionMSD.txt' ] # a browser window too, just because we can h2b.browseTheCloud() validations1= {} coefficients1= {} for csvFilename in csvFilenameList: # creates csvFilename.hex from file in importFolder dir h2i.setupImportHdfs() parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(key=parseKey['destination_key']) print "\n" + csvFilename start = time.time() # can't pass lamba as kwarg because it's a python reserved word # FIX! just look at X=0:1 for speed, for now kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1} glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs) # different when n_foldsidation is used? No trainingErrorDetails? h2o.verboseprint("\nglm:", glm) ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") GLMModel = glm['GLMModel'] print "GLM time", GLMModel['time'] coefficients = GLMModel['coefficients'] validationsList = GLMModel['validations'] validations = validationsList.pop() # validations['err'] if validations1: h2o_glm.compareToFirstGlm(self, 'err', validations, validations1) else: validations1 = copy.deepcopy(validations) if coefficients1: h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1) else: coefficients1 = copy.deepcopy(coefficients) sys.stdout.write('.') sys.stdout.flush()
    def test_categorical_expand_and_probability_output(self):
        # Synthetic datasets whose values are translated to enum letters, then
        # GLM; exercises categorical expansion and probability output.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
        tryList = [
            (7919, 53, 'cA', 600),
            # translated to enums, 4 per col, so don't go above 2k effective cols or too slow!
            (2659, 400, 'cB', 600),
        ]
        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)
            print "\nUpload and parse", csvPathname
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=240, retryDelaySecs=0.5)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            # paramDict is module-level; take the first choice for each param
            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # output is the last column
            y = colCount
            kwargs = {'y': y, 'max_iter': 20}
            kwargs.update(paramDict2)
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # only col Y-1 (next to last)doesn't get renamed in coefficients due to enum/categorical expansion
            print "y:", y
            # FIX! bug was dropped coefficients if constant column is dropped
            ### h2o_glm.simpleCheckGLM(self, glm, Y-2, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_short(self): csvFilename = 'part-00000b' ### csvFilename = 'short' importFolderPath = '/home/hduser/data' importFolderResult = h2i.setupImportFolder(None, importFolderPath) csvPathname = importFolderPath + "/" + csvFilename # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) start = time.time() # hardwire TAB as a separator, as opposed to white space (9) parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=500, separator=9) print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds" print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) # num_rows = inspect['num_rows'] # num_cols = inspect['num_cols'] keepPattern = "oly_|mt_|b_" y = "is_purchase" print "y:", y # don't need the intermediate Dicts produced from columnInfoFromInspect x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x kwargs = { 'x': x, 'y': y, # 'case_mode': '>', # 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 100, 'beta_epsilon': 1.0E-4, } timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "glm completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    def test_loop_random_param_covtype(self):
        # Poisson GLM trials on Goalies.csv with random params overlaid on
        # expert/GenGradient defaults. NOTE(review): the method name says
        # covtype but the dataset is Goalies.csv -- presumably copied from the
        # covtype variant; confirm intended.
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # for determinism, I guess we should spit out the seed?
        # random.seed(SEED)
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED = random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(5):
            # params is mutable. This is default.
            # FIX! does it never end if we don't have alpha specified?
            params = {
                'y': 5,
                'n_folds': 1,
                'family': "poisson",
                'alpha': 0.0,
                'lambda': 0,
                'beta_epsilon': 0.001,
                'max_iter': 3,
                'standardize': 1,
                'expert': 1,
                'lsm_solver': 'GenGradient',
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()
            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))
            start = time.time()
            print "May not solve. Expanded categorical columns causing a large # cols, small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
    def test_GLM_many_cols_tridist(self):
        # GLM over progressively wider synthetic datasets; module-level
        # paramDict supplies the first choice of each extra param, and the
        # coefficient of col 8 is checked after each run.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 20, 'cB', 300),
            (10000, 30, 'cC', 300),
            (10000, 40, 'cD', 300),
            (10000, 50, 'cE', 300),
            (10000, 60, 'cF', 300),
            (10000, 70, 'cG', 300),
            (10000, 80, 'cH', 300),
            (10000, 90, 'cI', 300),
            (10000, 100, 'cJ', 300),
            (10000, 200, 'cK', 300),
            (10000, 300, 'cL', 300),
            (10000, 400, 'cM', 300),
            (10000, 500, 'cN', 300),
            (10000, 600, 'cO', 300),
        ]
        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "\nParse result['destination_key']:", parseKey['destination_key']
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            # paramDict is module-level; take the first choice for each param
            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # output is the last column
            y = colCount
            kwargs = {'y': y}
            kwargs.update(paramDict2)
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, 8, **kwargs)
            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
def test_GLM_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 100), (10000, 200, 'cB', 200), (10000, 300, 'cC', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename y = colCount kwargs = { 'y': y, 'max_iter': 50, 'case': 1, 'family': 'binomial', 'lambda': 0, 'alpha': 0, 'max_iter': 50, 'weight': 1.0, 'thresholds': 0.5, 'n_folds': 2, 'beta_epsilon':1.0E-4, } start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "y:", y h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if not h2o.browse_disable: h2b.browseJsonHistoryAsUrlLastMatch("GLM") time.sleep(10) h2b.browseJsonHistoryAsUrlLastMatch("Inspect") time.sleep(10)
    def test_GLM_many_cols(self):
        # GLM over wide synthetic binary datasets; x is built explicitly as the
        # full concat'ed column list to mimic what the browser sends.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        if localhost:
            tryList = [
                (10000, 100, 'cA', 300),
                (10000, 1000, 'cB', 300),
                (10000, 3000, 'cC', 500),
            ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300),
                # (10000, 50, 'cC', 300),
                (10000, 100, 'cD', 300),
                (10000, 200, 'cE', 300),
                (10000, 300, 'cF', 300),
                (10000, 400, 'cG', 300),
                (10000, 500, 'cH', 300),
                (10000, 1000, 'cI', 300),
            ]
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            # output is the last column
            y = colCount
            # normally we dno't create x and rely on the default
            # create the big concat'ed x like the browser, to see what happens
            x = ','.join(map(str, range(colCount)))
            kwargs = {'x': x, 'y': y, 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)