def test_GLM_mnist_reals(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, "cD", 300), (n, 2, "cE", 300), (n, 4, "cF", 300), (n, 8, "cG", 300), (n, 16, "cH", 300), (n, 32, "cI", 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset( csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar ) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print csvFilename, "parse time:", parseResult["response"]["time"] print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) y = colCount kwargs = { "y": y, "max_iter": 200, "family": "binomial", "n_folds": 10, "alpha": 0, "lambda": 0, "thresholds": 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ {"alpha": 0.5, "lambda": 1e-4}, {"alpha": 0.25, "lambda": 1e-6}, {"alpha": 0.0, "lambda": 1e-8}, {"alpha": 0.5, "lambda": 0.0}, {"alpha": 0.0, "lambda": 0.0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" GLMModel = glm["GLMModel"] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel["iterations"] modelKey = GLMModel["model_key"] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" % iterations) parseResult = h2i.import_parse( path=csvScorePathname, schema="put", hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt ) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore( key=parseResult["destination_key"], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs ) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" ### print h2o.dump_json(glmScore) classErr = glmScore["validation"]["classErr"] auc = glmScore["validation"]["auc"] err = glmScore["validation"]["err"] nullDev = glmScore["validation"]["nullDev"] resDev = glmScore["validation"]["resDev"] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation["resDev"]) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-8 }, { 'alpha': 0.5, 'lambda': 0.0 }, { 'alpha': 0.0, 'lambda': 0.0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" % iterations) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore( key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ( "nullDev:\t", nullDev)
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-8}, {'alpha': 0.5, 'lambda': 0.0}, {'alpha': 0.0, 'lambda': 0.0}, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iterations > 20: raise Exception("Why take so many iterations: %s in this glm training?" % iterations) start = time.time() # score with same dataset (will change to recreated dataset with one less enum glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def test_GLM_mnist_reals(self): importFolderPath = "mnist" csvFilelist = [("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600)] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** csvPathname = importFolderPath + "/" + testCsvFilename testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse( path=csvPathname, schema="hdfs", hex_key=testKey, key2=testKey, timeoutSecs=timeoutSecs ) elapsed = time.time() - start print "parse end on ", testCsvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() csvPathname = importFolderPath + "/" + trainCsvFilename parseResult = h2i.import_parse( path=csvPathname, schema="hdfs", hex_key=trainKey, key2=trainKey, timeoutSecs=timeoutSecs ) elapsed = time.time() - start print "parse end on ", trainCsvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300) print "x:", x params = { "x": x, "y": y, "case_mode": "=", "case": 0, "family": "binomial", "lambda": 1.0e-5, "alpha": 0.0, "max_iter": 5, "thresholds": 0.5, "n_folds": 1, "weight": 1, "beta_epsilon": 1.0e-4, } for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs["case"] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm["GLMModel"] modelKey = GLMModel["model_key"] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_mnist(self): importFolderPath = "mnist" csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0,1,2,3,4,5,6,7,8,9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_mnist_reals(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) print "x:", x params = { 'x': x, 'y': y, 'case_mode': '=', 'case': 0, 'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.0, 'max_iter': 5, 'thresholds': 0.5, 'n_folds': 1, 'weight': 1, 'beta_epsilon': 1.0E-4, } for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_enums_unbalanced(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y+1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.5, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] if not validation or 'avg_err' not in validation: raise Exception("glm: %s" % h2o.dump_json(glm) + \ "\nNo avg_err in validation." + \ "\nLikely if you look back, the job was cancelled, so there's no cross validation.") avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception("Why take so many iterations: %s in this glm2 training?" % iterations) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y+1), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) if 1==0: # stuff from GLM1 classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y + 1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.5, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 0 }, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) print "glm2 end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] if not validation or 'avg_err' not in validation: raise Exception("glm: %s" % h2o.dump_json(glm) + \ "\nNo avg_err in validation." + \ "\nLikely if you look back, the job was cancelled, so there's no cross validation.") avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception( "Why take so many iterations: %s in this glm2 training?" % iterations) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess( pctWrong, 8, "Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) if 1 == 0: # stuff from GLM1 classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual( auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ( "resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ( "nullDev:\t", nullDev)
def test_GLM2_mnist(self): h2o.beta_features = True if DO_HDFS: importFolderPath = "mnist" bucket = None schema = 'hdfs' else: importFolderPath = "mnist" bucket = 'home-0xdiag-datasets' schema = 'local' csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 'C0' # first column is pixel value print "y:" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) # PARSE train**************************************** trainKey = trainCsvFilename + "_" + str(trial) + ".hex" csvPathname = importFolderPath + "/" + testCsvFilename start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # GLM**************************************** print "This is the pruned x we'll use" ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) print "ignoreX:", ignoreX params = { 'ignored_cols': ignoreX, 'response': y, 'case_mode': '=', 'case_val': 0, 'family': 'binomial', 'lambda': 0.5, 'alpha': 1e-4, 'max_iter': 5, ## 'thresholds': 0.5, ## 'weight': 1.0, 'n_folds': 1, 'beta_epsilon': 1.0E-4, } if DO_ALL_DIGITS: cases = [0,1,2,3,4,5,6,7,8,9] else: cases = [8] for c in cases: kwargs = params.copy() print "Trying binomial with case:", c kwargs['case_val'] = c timeoutSecs = 1800 start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "GLM completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] start = time.time() glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60) elapsed = time.time() - start print "GLMScore in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, "cD", 300), (n, 2, "cE", 300), (n, 4, "cF", 300), (n, 8, "cG", 300), (n, 16, "cH", 300), (n, 32, "cI", 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset( csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar ) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print csvFilename, "parse time:", parseResult["response"]["time"] print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse( path=csvScorePathname, schema="put", hex_key=testDataKey, timeoutSecs=30, separator=colSepInt ) y = colCount modelKey = "glm_model" kwargs = { "standardize": 0, "destination_key": modelKey, "response": "C" + str(y + 1), "max_iter": 200, "family": "binomial", "n_folds": 0, "alpha": 0, "lambda": 0, } start = time.time() updateList = [ {"alpha": 0.5, "lambda": 1e-4}, {"alpha": 0.25, "lambda": 1e-6}, {"alpha": 0.0, "lambda": 1e-12}, {"alpha": 0.5, "lambda": 1e-12}, {"alpha": 0.0, "lambda": 1e-12}, {"alpha": 0.0, "lambda": 0}, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM( parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs ) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) print "glm2 end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" glm_model = glm["glm_model"] _names = glm_model["_names"] modelKey = glm_model["_key"] coefficients_names = glm_model["coefficients_names"] submodels = glm_model["submodels"][0] beta = submodels["beta"] norm_beta = submodels["norm_beta"] iteration = submodels["iteration"] validation = submodels["validation"] if not validation or "avg_err" not in validation: raise Exception( "glm: %s" % h2o.dump_json(glm) + "\nNo avg_err in validation." + "\nLikely if you look back, the job was cancelled, so there's no cross validation." ) avg_err = validation["avg_err"] auc = validation["auc"] aic = validation["aic"] null_deviance = validation["null_deviance"] residual_deviance = validation["residual_deviance"] print "_names", _names print "coefficients_names", coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print "beta", beta print "iteration", iteration print "avg_err", avg_err print "auc", auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception("Why take so many iterations: %s in this glm2 training?" % iterations) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key predictKey = "Predict.hex" start = time.time() predictResult = h2o_cmd.runPredict( data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs ) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual="C" + str(y), predict=predictKey, vpredict="predict" ) cm = predictCMResult["cm"] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 8, "Should see less than 7 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) if 1 == 0: # stuff from GLM1 classErr = glmScore["validation"]["classErr"] auc = glmScore["validation"]["auc"] err = glmScore["validation"]["err"] nullDev = glmScore["validation"]["nullDev"] resDev = glmScore["validation"]["resDev"] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation["resDev"]) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)