def test_rf_multinomial_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_multinomial.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
    totalRows = 400
    colCount = 7
    for trial in range(5):
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)
        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hexKey = csvFilename + "_" + str(trial) + ".hex"
        ntree = 2
        kwargs = {
            'ntrees': ntree,
            'mtries': None,
            'max_depth': 20,
            'sample_rate': 0.67,
            'destination_key': None,
            'nbins': 1024,
            'seed': 784834182943470027,
        }
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey, doSummary=True)
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
        print "trial #", trial, 'took', time.time() - start, 'seconds'
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        modelKey = rfView['drf_model']['_key']
        h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey,
            vactual=colCount + 1, vpredict=1, expectedAuc=0.5, doAUC=False)
        h2b.browseJsonHistoryAsUrlLastMatch("RF")
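# test_rf_multinomial_fvec above calls a module-level write_syn_dataset() that is
# not part of this excerpt (each test file in this suite defines its own variant).
# A minimal sketch of what it plausibly does here -- write the header row, then
# totalRows rows with an ID, colCount random feature columns, and a multinomial
# response (the vactual=colCount+1 column scored above). The body is an
# illustrative assumption, not the original helper.
import random

def write_syn_dataset(csvPathname, rowCount, colCount, headerData):
    dsf = open(csvPathname, 'w')
    dsf.write(headerData + '\n')
    for i in range(rowCount):
        rowData = [str(i)]  # ID column
        rowData += [str(random.randint(0, 7)) for _ in range(colCount)]
        rowData.append(str(random.randint(0, 9)))  # multinomial response column
        dsf.write(','.join(rowData) + '\n')
    dsf.close()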
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname,
        timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"

    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads?
    # really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)

    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey,
        vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in", (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))

    # compare this glm to the first one. since the files are replications,
    # the results should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        self.glmScore1 = copy.deepcopy(glmScore)
def test_rf_enums_mappings(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        # (n, 4, 'cG', 300),
        # (n, 5, 'cH', 300),
        # (n, 6, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
        (ROWS, COLS, 'cI', 300),
    ]
    # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
    SEED_FOR_TRAIN = 1234567890
    SEED_FOR_SCORE = 9876543210
    errorHistory = []
    enumHistory = []
    lastcolsTrainHistory = []
    lastcolsScoreHistory = []
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        enumList = create_enum_list(listSize=ENUMS)
        # reverse the list
        enumList.reverse()

        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        # use same enum List
        enumListForScore = enumList

        print "Creating random", csvPathname, "for rf model building"
        lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)
        lastcolsTrainHistory.append(lastcols)

        print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
        # same enum list/mapping, but different dataset?
        lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
        lastcolsScoreHistory.append(lastcols)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'enums'
        # limit depth and number of trees to accentuate the issue with categorical split decisions
        # use mtries so both look at all cols at every split? doesn't matter for speedrf
        # does speedrf try one more time? with 3 cols, mtries=2, so another try might
        # get a look at the missing col
        # does matter for drf2. does it "just stop"
        # trying mtries always looking at all columns or 1 col might be interesting
        if SPEEDRF:
            kwargs = {
                'sample_rate': 0.999,
                'destination_key': modelKey,
                'response': y,
                'ntrees': 1,
                'max_depth': 100,
                # 'oobee': 1,
                'validation': hex_key,
                # 'validation': scoreDataKey,
                'seed': 123456789,
                'mtries': COLS,
            }
        elif GBM:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'validation': scoreDataKey,
                'seed': 123456789,
                # 'learn_rate': .1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'classification': 1,
            }
        else:
            kwargs = {
                'sample_rate': 0.999,
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'validation': hex_key,
                # 'validation': scoreDataKey,
                'seed': 123456789,
                'nbins': 1024,
                'mtries': COLS,
            }

        for r in range(2):
            start = time.time()
            if GBM:
                gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "gbm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(gbmResult)
                (classification_error, classErrorPctList, totalScores) = \
                    h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)
            elif SPEEDRF:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "speedrf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                (classification_error, classErrorPctList, totalScores) = \
                    h2o_rf.simpleCheckRFView(rfv=rfResult)
            else:
                rfResult = h2o_cmd.runRF(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                (classification_error, classErrorPctList, totalScores) = \
                    h2o_rf.simpleCheckRFView(rfv=rfResult)

            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey,
                vactual=y, vpredict=1, doAUC=not MULTINOMIAL)  # , expectedAuc=0.5

            errorHistory.append(classification_error)
            enumHistory.append(enumList)

    print "error from all runs on this dataset (with different enum mappings)"
    print errorHistory
    for e in enumHistory:
        print e
    print "last row from all train datasets, as integer"
    for l in lastcolsTrainHistory:
        print l
    print "last row from all score datasets, as integer"
    for l in lastcolsScoreHistory:
        print l
def test_GLM2_ints_unbalanced(self):
    ### h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 2000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 4, 'cF', 300),
        (n, 8, 'cG', 300),
        (n, 16, 'cH', 300),
        (n, 32, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list()
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'xyz'
        kwargs = {
            'n_folds': 0,
            'destination_key': modelKey,
            'response': y,
            'max_iter': 200,
            'family': 'binomial',
            'alpha': 0,
            'lambda': 0,
        }

        start = time.time()
        updateList = [
            {'alpha': 0.5, 'lambda': 1e-5},
            # {'alpha': 0.25, 'lambda': 1e-4},
        ]

        # Try each one
        for updateDict in updateList:
            print "\n#################################################################"
            print updateDict
            kwargs.update(updateDict)
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex",
                timeoutSecs=30, separator=colSepInt)
            h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, vactual='C' + str(y + 1),
                vpredict=1, expectedAuc=0.45)
def test_rf_enums_score_superset_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 3000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)
        # add an extra enum for scoring that's not in the model enumList
        enumListForScore.append("xyzzy")

        print "Creating random", csvPathname, "for rf model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'enums'
        ntrees = 5
        kwargs = {
            'destination_key': modelKey,
            'response': y,
            'classification': 1,
            'ntrees': ntrees,
            'validation': scoreDataKey,
        }
        start = time.time()
        rfResult = h2o_cmd.runRF(parseResult=parseResult,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)

        predictKey = 'Predict.hex'
        h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey,
            vactual=y, vpredict=1, expectedAuc=0.5)
def test_rf_many_rooz_enums_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    if 1 == 0 and localhost:
        n = 4000
        tryList = [
            (n, 999, 'cI', 300),
        ]
    else:
        n = 100
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
            (n, 10, 'cLA', 300),
            (n, 11, 'cDA', 300),
            (n, 12, 'cEA', 300),
            (n, 13, 'cFA', 300),
            (n, 14, 'cGA', 300),
            (n, 15, 'cHA', 300),
            (n, 16, 'cIA', 300),
            (n, 17, 'cJA', 300),
            (n, 19, 'cKA', 300),
            (n, 20, 'cLA', 300),
        ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # can randomly pick the row and col cases.
        ### colSepCase = random.randint(0, 1)
        colSepCase = 1
        # using the comma is nice to ensure no craziness
        if (colSepCase == 0):
            colSepHexString = '01'
        else:
            colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar
        print "colSepInt", colSepInt

        rowSepCase = random.randint(0, 1)
        # using this instead makes the file 'row-readable' in an editor
        if (rowSepCase == 0):
            rowSepHexString = '0a'  # newline
        else:
            rowSepHexString = '0d0a'  # cr + newline (windows) \r\n
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        if DO_TEN_INTEGERS:
            csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        else:
            csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        # We should be able to see the parse result?
        ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename

        # we allow some NAs in the list above
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        y = colCount
        ntrees = 5
        kwargs = {
            'response': y,
            'classification': 1,
            'ntrees': ntrees,
        }
        start = time.time()
        rfResult = h2o_cmd.runRF(parseResult=parseResult,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "rf end on ", csvPathname, 'took', time.time() - start, 'seconds'
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)

        modelKey = rfResult['drf_model']['_key']
        h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey,
            vactual=colCount, vpredict=1, expectedAuc=0.5, doAUC=False)
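# Separator handling, as used in every test above: the two-character hex string
# is decoded to the literal separator character (for writing the synthetic file)
# and also converted to an int (for import_parse's separator= argument, which
# answers the "FIX!" question in the test above). A quick Python 2 sanity check
# of both conversions:
assert '2c'.decode('hex') == ',' and int('2c', base=16) == 44
assert '0d0a'.decode('hex') == '\r\n'  # the windows row-separator case
assert '0a'.decode('hex') == '\n'

# test_rf_many_rooz_enums_fvec also assumes the module-level flags localhost and
# DO_TEN_INTEGERS, which are outside this excerpt; illustrative values only:
localhost = True
DO_TEN_INTEGERS = False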
def test_GLM2_enums_score_subset(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 500
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key,
            timeoutSecs=30, separator=colSepInt)

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        kwargs = {
            'response': y,
            'max_iter': 8,
            'family': 'binomial',
            'n_folds': 2,
            'alpha': 0.2,
            'lambda': 1e-5,
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Score *******************************
        # this messes up if you use case_mode/case_value above
        predictKey = 'Predict.hex'
        modelKey = glm['glm_model']['_key']
        h2o_cmd.runScore(dataKey="score_" + hex_key, modelKey=modelKey,
            vactual=y, vpredict=1, expectedAuc=0.6)
def test_rf_enums_score_superset_fvec(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 3000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)
        # add an extra enum for scoring that's not in the model enumList
        enumListForScore.append("xyzzy")

        print "Creating random", csvPathname, "for rf model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'enums'
        ntrees = 5
        kwargs = {
            'destination_key': modelKey,
            'response': y,
            'classification': 1,
            'ntrees': ntrees,
            'validation': scoreDataKey,
        }
        start = time.time()
        rfResult = h2o_cmd.runRF(parseResult=parseResult,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        (classification_error, classErrorPctList, totalScores) = \
            h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)

        predictKey = 'Predict.hex'
        h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey,
            vactual=y, vpredict=1, expectedAuc=0.5)
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True, expectedAuc=0.5):
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    numCols = inspect['numCols']
    numRows = inspect['numRows']
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    trial = 0
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)

        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        response = "C55"
        h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90,
            outputClass=4, outputCol=numCols - 1, changeToBinomial=not DO_MULTINOMIAL)
        sliceResult = {'destination_key': dataKeyTrain}

        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20
        start = time.time()
        # have to pass validation= param to avoid getting no error results
        # (since 100% sample..DRF2 doesn't like that)
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs,
            validation=dataKeyTest, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        # oobeTrainPctRight = 100 * (1.0 - error)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f" % \
                ((trial * 10), oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest, modelKey=model_key,
            vactual=response, vpredict=1, expectedAuc=expectedAuc)
        print h2o.dump_json(rfvScoring)
        h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        print "hello7"
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f" % \
                ((trial * 10), fullScorePctRight, expectScorePctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse * 100.0 / numRows), "pct. of all rows"

    actualDelta = [abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    return rfvScoring
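# rf_covtype_train_oobe() assumes several module-level settings that are not
# shown in this excerpt: paramDict (the base RF kwargs, which must include
# 'ntrees' since the timeout is computed from it), ALLOWED_DELTA (the tolerance
# for the pct-right assertions), and DO_MULTINOMIAL. A plausible, purely
# illustrative configuration:
DO_MULTINOMIAL = True   # False would recode the covtype target to binomial (class 4)
ALLOWED_DELTA = 3       # pct-right tolerance used by assertAlmostEqual above
paramDict = {
    'ntrees': 25,       # drives timeoutSecs = 30 + ntrees * 20 above
    'max_depth': 30,
    'nbins': 100,
}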
def test_GLM2_enums_score_subset(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 500
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key,
            timeoutSecs=30, separator=colSepInt)

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        kwargs = {
            'response': y,
            'max_iter': 8,
            'family': 'binomial',
            'n_folds': 2,
            'alpha': 0.2,
            'lambda': 1e-5,
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Score *******************************
        # this messes up if you use case_mode/case_value above
        predictKey = 'Predict.hex'
        modelKey = glm['glm_model']['_key']
        h2o_cmd.runScore(dataKey="score_" + hex_key, modelKey=modelKey,
            vactual=y, vpredict=1, expectedAuc=0.5)
def test_GLM_enums_unbalanced(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 2000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 4, 'cF', 300),
        (n, 8, 'cG', 300),
        (n, 16, 'cH', 300),
        (n, 32, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm2 model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        testDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
            timeoutSecs=30, separator=colSepInt)

        y = colCount
        modelKey = 'glm_model'
        kwargs = {
            'standardize': 0,
            'destination_key': modelKey,
            'response': 'C' + str(y + 1),
            'max_iter': 200,
            'family': 'binomial',
            'n_folds': 0,
            'alpha': 0,
            'lambda': 0,
        }

        start = time.time()
        updateList = [
            {'alpha': 0.5, 'lambda': 1e-4},
            {'alpha': 0.25, 'lambda': 1e-6},
            {'alpha': 0.0, 'lambda': 1e-12},
            {'alpha': 0.5, 'lambda': 1e-12},
            {'alpha': 0.0, 'lambda': 1e-12},
            {'alpha': 0.0, 'lambda': 0},
        ]

        # Try each one
        for updateDict in updateList:
            print "\n#################################################################"
            print updateDict
            kwargs.update(updateDict)
            print "If we poll, we get a message saying it was cancelled by user??"
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

            glm_model = glm['glm_model']
            _names = glm_model['_names']
            modelKey = glm_model['_key']
            coefficients_names = glm_model['coefficients_names']
            submodels = glm_model['submodels'][0]
            beta = submodels['beta']
            norm_beta = submodels['norm_beta']
            iteration = submodels['iteration']

            validation = submodels['validation']
            auc = validation['auc']
            aic = validation['aic']
            null_deviance = validation['null_deviance']
            residual_deviance = validation['residual_deviance']

            print '_names', _names
            print 'coefficients_names', coefficients_names
            # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
            print 'beta', beta
            print 'iteration', iteration
            print 'auc', auc

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            if iteration > 20:
                raise Exception("Why take so many iterations: %s in this glm2 training?" % iteration)

        # Score **********************************************
        print "Problems with test data having different enums than train? just use train for now"
        testDataKey = hex_key
        h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey,
            vactual=y, vpredict=1, expectedAuc=0.5)
def test_rf_enums_mappings_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 3000
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        # (n, 4, 'cG', 300),
        # (n, 5, 'cH', 300),
        # (n, 6, 'cI', 300),
        (n, 3, 'cI', 300),
        (n, 3, 'cI', 300),
        (n, 3, 'cI', 300),
    ]
    # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
    SEED_FOR_TRAIN = 1234567890
    SEED_FOR_SCORE = 9876543210
    errorHistory = []
    enumHistory = []
    lastcolsTrainHistory = []
    lastcolsScoreHistory = []
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        enumList = create_enum_list(listSize=ENUMS)
        # reverse the list
        enumList.reverse()

        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        # use same enum List
        enumListForScore = enumList

        print "Creating random", csvPathname, "for rf model building"
        lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)
        lastcolsTrainHistory.append(lastcols)

        print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
        # same enum list/mapping, but different dataset?
        lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount,
            colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
        lastcolsScoreHistory.append(lastcols)

        scoreDataKey = "score_" + hex_key
        parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'enums'
        # limit depth and number of trees to accentuate the issue with categorical split decisions
        if SPEEDRF:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'num_trees': 1,
                'max_depth': 100,
                'oobee': 1,
                'seed': 123456789,
            }
        else:
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': 1,
                'max_depth': 100,
                'min_rows': 1,
                'validation': scoreDataKey,
                'seed': 123456789,
            }

        for r in range(4):
            start = time.time()
            if SPEEDRF:
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            else:
                rfResult = h2o_cmd.runRF(parseResult=parseResult,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            # print h2o.dump_json(rfResult)
            (classification_error, classErrorPctList, totalScores) = \
                h2o_rf.simpleCheckRFView(rfv=rfResult)

            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey,
                vactual=y, vpredict=1, doAUC=not MULTINOMIAL)  # , expectedAuc=0.5

            errorHistory.append(classification_error)
            enumHistory.append(enumList)

    print "error from all runs on this dataset (with different enum mappings)"
    print errorHistory
    for e in enumHistory:
        print e
    print "last row from all train datasets, as integer"
    for l in lastcolsTrainHistory:
        print l
    print "last row from all score datasets, as integer"
    for l in lastcolsScoreHistory:
        print l
def test_GLM2_ints_unbalanced(self):
    h2o.beta_features = True
    ### h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 2000
    tryList = [
        (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 4, 'cF', 300),
        (n, 8, 'cG', 300),
        (n, 16, 'cH', 300),
        (n, 32, 'cI', 300),
    ]
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list()
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)
        print "Parse result['destination_key']:", parseResult['destination_key']

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        modelKey = 'xyz'
        kwargs = {
            'n_folds': 0,
            'destination_key': modelKey,
            'response': y,
            'max_iter': 200,
            'family': 'binomial',
            'alpha': 0,
            'lambda': 0,
        }

        start = time.time()
        updateList = [
            {'alpha': 0.5, 'lambda': 1e-5},
            # {'alpha': 0.25, 'lambda': 1e-4},
        ]

        # Try each one
        for updateDict in updateList:
            print "\n#################################################################"
            print updateDict
            kwargs.update(updateDict)
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex",
                timeoutSecs=30, separator=colSepInt)
            h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, vactual='C' + str(y + 1),
                vpredict=1, expectedAuc=0.6)