def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    """Parse csvPathname into <csvFilename>.hex and run a GLM on it.

    Compares this run's 'err' validation to the first run seen
    (self.validations1) to detect drift across replicated datasets.
    """
    print "\nStarting GLM of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # y is the response column index (as a string); x empty means "use all others"
    y = "10"
    x = ""
    # Took num_cross_validation_folds out, because GLM doesn't include num_cross_validation_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'x': x, 'y': y, 'case': -1}
    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        # first run: remember it as the baseline for later comparisons
        self.validations1 = copy.deepcopy(validations)
def test_GLM_from_import_hosts(self):
    """Import covtype datasets from an import folder, run binomial GLM on each,
    and compare err/coefficients of successive runs to the first run."""
    if localhost:
        csvFilenameList = ["covtype.data"]
    else:
        # replicated/scaled copies so successive GLMs should give similar results
        csvFilenameList = [
            "covtype200x.data",
            "covtype200x.data",
            "covtype.data",
            "covtype.data",
            "covtype20x.data",
            "covtype20x.data",
        ]
    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = "/home/0xdiag/datasets/standard"
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
        print csvFilename, "parse time:", parseKey["response"]["time"]
        print "Parse result['destination_key']:", parseKey["destination_key"]
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm["GLMModel"]
        coefficients = GLMModel["coefficients"]
        validationsList = GLMModel["validations"]
        validations = validationsList.pop()
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            # "0" is the coefficient key compared against the first run
            h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        # progress dot per dataset
        sys.stdout.write(".")
        sys.stdout.flush()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    """Import/parse csvPathname from a bucket and run a GLM on it.

    Compares this run's 'err' validation against the first run
    (self.validations1) since the files are replications.
    """
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    # response column index (string form); empty x means "all other columns"
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values 1,-1. need to specify case
    kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5}
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in", (time.time() - start), "secs (python)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        # first run becomes the baseline
        self.validations1 = copy.deepcopy(validations)
def glm_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    """Parse csvPathname into <csvFilename>.hex and run a GLM on it,
    comparing 'err' against the first run (self.validations1)."""
    print "\nStarting GLM of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # response column index (string form); empty x means "all other columns"
    y = "10"
    x = ""
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'x': x, 'y': y, 'case': -1}
    start = time.time()
    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glm['GLMModel']
    validationsList = glm['GLMModel']['validations']
    validations = validationsList[0]
    # validations['err']
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        # first run becomes the baseline
        self.validations1 = copy.deepcopy(validations)
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    """Import/parse csvPathname, force the response column to enum,
    run a binomial GLM, and compare 'auc' against the first run."""
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=10)
    # response column index (0-based here; to_enum below takes 1-based)
    y = 10
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'response': y, 'alpha': 0, 'family': 'binomial'}
    h2o.nodes[0].to_enum(src_key=parseResult['destination_key'], column_index=y + 1)
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    glm_model = glm['glm_model']
    validation = glm_model['submodels'][0]['validation']
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'auc', validation, self.validation1)
    else:
        # first run becomes the baseline
        self.validation1 = copy.deepcopy(validation)
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30):
    """Parse csvPathname, round-trip the model through save/load, then score
    the parsed frame with it and compare 'mse' against the first score seen."""
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    # response column index (string form) used as vactual below
    y = "10"
    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)
    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in", (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    # compare this glm to the first one. since the files are replications,
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        # first score becomes the baseline
        self.glmScore1 = copy.deepcopy(glmScore)
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    """Import/parse csvPathname, force the response column to enum,
    run a binomial GLM, and compare 'auc' against the first run."""
    print "\nStarting GLM of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=10)
    # response column index (0-based here; to_enum below takes 1-based)
    y = 10
    # Took n_folds out, because GLM doesn't include n_folds time and it's slow
    # wanted to compare GLM time to my measured time
    # hastie has two values, 1 and -1. need to use case for one of them
    kwargs = {'response': y, 'alpha': 0, 'family': 'binomial'}
    h2o.nodes[0].to_enum(src_key=parseResult['destination_key'], column_index=y+1)
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "GLM in", (time.time() - start), "secs (python measured)"
    h2o_glm.simpleCheckGLM(self, glm, "C8", **kwargs)
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    glm_model = glm['glm_model']
    validation = glm_model['submodels'][0]['validation']
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'auc', validation, self.validation1)
    else:
        # first run becomes the baseline
        self.validation1 = copy.deepcopy(validation)
def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5", timeoutSecs=30, pollTimeoutSecs=30): print "\nStarting GLM score of", csvFilename key2 = csvFilename + ".hex" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs) y = "10" x = "" kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5} start = time.time() glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "GLMScore in", (time.time() - start), "secs (python)" h2o.verboseprint(h2o.dump_json(glmScore)) ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs) # compare this glm to the first one. since the files are replications, # the results # should be similar? # UPDATE: format for returning results is slightly different than normal GLM validation = glmScore['validation'] if self.validations1: h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1) else: self.validations1 = copy.deepcopy(validation)
def test_GLM_from_import_hosts(self):
    """Import covtype datasets via h2i.import_parse, run binomial GLM on each,
    and compare err/coefficients of successive runs to the first run."""
    if localhost:
        csvFilenameList = [
            'covtype.data',
        ]
    else:
        # replicated/scaled copies so successive GLMs should give similar results
        csvFilenameList = [
            'covtype200x.data',
            'covtype200x.data',
            'covtype.data',
            'covtype.data',
            'covtype20x.data',
            'covtype20x.data',
        ]
    # a browser window too, just because we can
    ## h2b.browseTheCloud()
    importFolderPath = "standard"
    validations1= {}
    coefficients1= {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            # '0' is the coefficient key compared against the first run
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        # progress dot per dataset
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM_catdata_hosts(self):
    """Run binomial GLM on two logreg datasets from smalldata and compare
    'err' of each run against the first run (drift check)."""
    # these are still in /home/kevin/scikit/datasets/logreg
    # FIX! just two for now..
    csvFilenameList = [
        "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz"
    ]
    # pop open a browser on the cloud
    h2b.browseTheCloud()
    # save the first, for all comparisions, to avoid slow drift with each iteration
    validations1 = {}
    for csvFilename in csvFilenameList:
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        # I use this if i want the larger set in my localdir
        # csvPathname = h2o.find_file('/home/kevin/scikit/datasets/logreg/' + csvFilename)
        print "\n" + csvPathname
        start = time.time()
        # FIX! why can't I include 0 here? it keeps getting 'unable to solve" if 0 is included
        # 0 by itself is okay?
        kwargs = {
            'y': 7,
            'x': '1,2,3,4,5,6',
            'family': "binomial",
            'n_folds': 3,
            'lambda': 1e-4
        }
        timeoutSecs = 200
        glm = h2o_cmd.runGLM(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 6, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time(
        ) - start, 'seconds'
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        validationsList = glm['GLMModel']['validations']
        print validationsList
        validations = validationsList[0]
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        # progress dot per dataset
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM2_from_import_hosts(self):
    """GLM2 (beta_features) version: import covtype data, binarize the
    response via exec, run binomial GLM2, compare 'auc' to the first run."""
    h2o.beta_features = True
    csvFilenameList = [
        'covtype.data',
        'covtype20x.data',
    ]
    # a browser window too, just because we can
    ## h2b.browseTheCloud()
    importFolderPath = "standard"
    validation1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key='A.hex', timeoutSecs=2000)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        case = 1
        y = 54
        # binarize the response in-place: 1 where the class equals 'case', else 0
        # (exec column indexing is 1-based, hence y + 1)
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        kwargs = {'response': y, 'n_folds': 2, 'family': "binomial"}
        glm = h2o_cmd.runGLM(parseResult={'destination_key': 'A.hex'}, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        # compare this glm to the first one. since the files are replications, the results
        # should be similar?
        validation = glm['glm_model']['submodels'][0]['validation']
        if validation1:
            h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1)
        else:
            validation1 = copy.deepcopy(validation)
def test_GLM2_catdata_hosts(self): h2o.beta_features = True # these are still in /home/kevin/scikit/datasets/logreg # FIX! just two for now.. csvFilenameList = [ "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz" ] # pop open a browser on the cloud ### h2b.browseTheCloud() # save the first, for all comparisions, to avoid slow drift with each iteration validation1 = {} for csvFilename in csvFilenameList: csvPathname = csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') print "\n" + csvPathname start = time.time() # FIX! why can't I include 0 here? it keeps getting 'unable to solve" if 0 is included # 0 by itself is okay? kwargs = { 'response': 7, 'family': "binomial", 'n_folds': 3, 'lambda': 1e-4 } timeoutSecs = 200 glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, 'C7', **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") # compare this glm to the first one. since the files are replications, the results # should be similar? validation = glm['glm_model']['submodels'][0]['validation'] if validation1: h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1) else: validation1 = copy.deepcopy(validation)
def test_GLM_catdata_hosts(self):
    """Run binomial GLM (older num_cross_validation_folds API) on two logreg
    datasets and compare 'err' of each run against the first run."""
    # these are still in /home/kevin/scikit/datasets/logreg
    # FIX! just two for now..
    csvFilenameList = [
        "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz"
    ]
    # pop open a browser on the cloud
    h2b.browseTheCloud()
    # save the first, for all comparisions, to avoid slow drift with each iteration
    validations1 = {}
    for csvFilename in csvFilenameList:
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        # I use this if i want the larger set in my localdir
        # csvPathname = h2o.find_file('/home/kevin/scikit/datasets/logreg/' + csvFilename)
        print "\n" + csvPathname
        start = time.time()
        # FIX! why can't I include 0 here? it keeps getting 'unable to solve" if 0 is included
        # 0 by itself is okay?
        kwargs = {'y': 7, 'x': '1,2,3,4,5,6', 'family': "binomial", 'num_cross_validation_folds': 3, 'lambda': 1e-4}
        timeoutSecs = 200
        glm = h2o_cmd.runGLM(csvPathname=csvPathname, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 6, **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        validationsList = glm['GLMModel']['validations']
        print validationsList
        validations = validationsList[0]
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        # progress dot per dataset
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM2_from_import_hosts(self):
    """GLM2 (beta_features) version: import covtype data, binarize the
    response via exec, run binomial GLM2, compare 'auc' to the first run."""
    h2o.beta_features = True
    csvFilenameList = [
        'covtype.data',
        'covtype20x.data',
    ]
    # a browser window too, just because we can
    ## h2b.browseTheCloud()
    importFolderPath = "standard"
    validation1= {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key='A.hex', timeoutSecs=2000)
        print "Parse result['destination_key']:", parseResult['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        case = 1
        y = 54
        # binarize the response in-place: 1 where the class equals 'case', else 0
        # (exec column indexing is 1-based, hence y+1)
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        kwargs = {'response': y, 'n_folds': 2, 'family': "binomial"}
        glm = h2o_cmd.runGLM(parseResult={'destination_key': 'A.hex'}, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        # compare this glm to the first one. since the files are replications, the results
        # should be similar?
        validation = glm['glm_model']['submodels'][0]['validation']
        if validation1:
            h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1)
        else:
            validation1 = copy.deepcopy(validation)
def test_GLM_catdata_hosts(self):
    """Run binomial GLM on two logreg datasets (via h2i.import_parse) and
    compare 'err' of each run against the first run."""
    # these are still in /home/kevin/scikit/datasets/logreg
    # FIX! just two for now..
    csvFilenameList = [
        "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz"
    ]
    # pop open a browser on the cloud
    ### h2b.browseTheCloud()
    # save the first, for all comparisions, to avoid slow drift with each iteration
    validations1 = {}
    for csvFilename in csvFilenameList:
        csvPathname = csvFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
        print "\n" + csvPathname
        start = time.time()
        # FIX! why can't I include 0 here? it keeps getting 'unable to solve" if 0 is included
        # 0 by itself is okay?
        kwargs = {'y': 7, 'x': '1,2,3,4,5,6', 'family': "binomial", 'n_folds': 3, 'lambda': 1e-4}
        timeoutSecs = 200
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, 'C7', **kwargs)
        print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        validationsList = glm['GLMModel']['validations']
        print validationsList
        validations = validationsList[0]
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
def test_GLM2_catdata_hosts(self): h2o.beta_features = True # these are still in /home/kevin/scikit/datasets/logreg # FIX! just two for now.. csvFilenameList = [ "1_100kx7_logreg.data.gz", "2_100kx7_logreg.data.gz" ] # pop open a browser on the cloud ### h2b.browseTheCloud() # save the first, for all comparisions, to avoid slow drift with each iteration validation1 = {} for csvFilename in csvFilenameList: csvPathname = csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') print "\n" + csvPathname start = time.time() # FIX! why can't I include 0 here? it keeps getting 'unable to solve" if 0 is included # 0 by itself is okay? kwargs = {'response': 7, 'family': "binomial", 'n_folds': 3, 'lambda': 1e-4} timeoutSecs = 200 glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, 'C7', **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' ### h2b.browseJsonHistoryAsUrlLastMatch("GLM") # compare this glm to the first one. since the files are replications, the results # should be similar? validation = glm['glm_model']['submodels'][0]['validation'] if validation1: h2o_glm.compareToFirstGlm(self, 'auc', validation, validation1) else: validation1 = copy.deepcopy(validation)
def glm_score(self, csvFilename, csvPathname, modelKey, timeoutSecs=3):
    """Parse csvPathname and score it with an existing GLM model (modelKey),
    comparing 'err' against the first score seen (self.validations1)."""
    print "\nStarting GLM score of", csvFilename
    key2 = csvFilename + ".hex"
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
    # kwargs below is built but unused here (scoring takes no x/y);
    # kept from the original — presumably a leftover from the GLM-build variant
    y = "10"
    x = ""
    kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5}
    start = time.time()
    glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, timeoutSecs=timeoutSecs)
    print "GLMScore in", (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glmScore['GLMModel']
    validationsList = glmScore['GLMModel']['validations']
    validations = validationsList[0]
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        # first score becomes the baseline
        self.validations1 = copy.deepcopy(validations)
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    """Newer-API GLM run: parse with a forced Enum column, summarize the first
    10 columns, build a binomial GLM via h2o.n0.build_model, then compute and
    fetch model metrics and predictions."""
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # "Numeric"
    # make the last column enum
    # Instead of string for parse, make this a dictionary, with column index, value
    # that's used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k,v in co:
            print k, v
    expected = []
    allowedDelta = 0
    # use every column except the response C11
    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1
    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial',
        'link': None,
        'tweedie_variance_power': None,
        'tweedie_link_power': None,
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'prior1': None,
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
        'n_folds': 1,
    }
    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(
        algo='glm',
        destination_key=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')
    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')
    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)
    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')
    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')
    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    # NOTE(review): 'validation' is never assigned in this function, so the
    # true branch below would raise NameError if self.validation1 were ever
    # truthy. It is currently dead because the else branch always stores None.
    # TODO: confirm intent and restore the deepcopy of a real validation object.
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None
def test_GLM_from_import_hosts(self):
    """Import covtype datasets from an import folder (single setup), run
    binomial GLM on each, compare err/coefficients against the first run."""
    if localhost:
        csvFilenameList = [
            'covtype.data',
        ]
    else:
        # replicated/scaled copies so successive GLMs should give similar results
        csvFilenameList = [
            'covtype200x.data',
            'covtype200x.data',
            'covtype.data',
            'covtype.data',
            'covtype20x.data',
            'covtype20x.data',
        ]
    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    validations1= {}
    coefficients1= {}
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        print "GLM time", GLMModel['time']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            # '0' is the coefficient key compared against the first run
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        # progress dot per dataset
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM_from_import_hosts(self):
    """Import covtype datasets from an import folder (single setup), run
    binomial GLM on each, compare err/coefficients against the first run."""
    if localhost:
        csvFilenameList = [
            'covtype.data',
        ]
    else:
        # replicated/scaled copies so successive GLMs should give similar results
        csvFilenameList = [
            'covtype200x.data',
            'covtype200x.data',
            'covtype.data',
            'covtype.data',
            'covtype20x.data',
            'covtype20x.data',
        ]
    # a browser window too, just because we can
    h2b.browseTheCloud()
    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        h2b.browseJsonHistoryAsUrlLastMatch("GLM")
        GLMModel = glm['GLMModel']
        print "GLM time", GLMModel['time']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            # '0' is the coefficient key compared against the first run
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        # progress dot per dataset
        sys.stdout.write('.')
        sys.stdout.flush()
def test_GLM_from_import_hosts(self):
    """Import covtype datasets via h2i.import_parse, run binomial GLM on each,
    and compare err/coefficients of successive runs to the first run."""
    if localhost:
        csvFilenameList = [
            'covtype.data',
        ]
    else:
        # replicated/scaled copies so successive GLMs should give similar results
        csvFilenameList = [
            'covtype200x.data',
            'covtype200x.data',
            'covtype.data',
            'covtype.data',
            'covtype20x.data',
            'covtype20x.data',
        ]
    # a browser window too, just because we can
    ## h2b.browseTheCloud()
    importFolderPath = "standard"
    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # have to re-import each iteration now, since the source key
        # is removed and if we re-parse it, it's not there
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['destination_key']:", parseResult[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvFilename
        start = time.time()
        # can't pass lamba as kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=2000, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        h2o.verboseprint("\nglm:", glm)
        GLMModel = glm['GLMModel']
        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()
        # validations['err']
        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)
        if coefficients1:
            # 'C1' is the coefficient key compared against the first run
            h2o_glm.compareToFirstGlm(self, 'C1', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)
        # progress dot per dataset
        sys.stdout.write('.')
        sys.stdout.flush()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    """Newer-API GLM run (model_id variant): parse with a forced Enum column,
    summarize the first 10 columns, build a binomial GLM via
    h2o.n0.build_model, then compute and fetch model metrics and predictions."""
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # "Numeric"
    # make the last column enum
    # Instead of string for parse, make this a dictionary, with column index, value
    # that's used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict, hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k,v in co:
            print k, v
    expected = []
    allowedDelta = 0
    # use every column except the response C11
    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1
    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial',
        'link': None,
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
    }
    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(
        algo='glm',
        model_id=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')
    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')
    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)
    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')
    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')
    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    # NOTE(review): 'validation' is never assigned in this function, so the
    # true branch below would raise NameError if self.validation1 were ever
    # truthy. It is currently dead because the else branch always stores None.
    # TODO: confirm intent and restore the deepcopy of a real validation object.
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None