def test_50_nongz_fvec(self): avgMichalSize = 237270000 * 2 bucket = 'home-0xdiag-datasets' importFolderPath = "many_many" print "Using non-gz'ed files in", importFolderPath csvFilenameList = [ ("*.dat", "file_18_A.dat", 18 * avgMichalSize, 1800), ] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json( importFailList) start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult[ 'destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg
def test_50_nongz_fvec(self): h2o.beta_features = True avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath csvFilenameList= [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), ] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg
def findXFromColumnInfo(key=None, keepList=None, timeoutSecs=120, noPrint=False): (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, max_column_display=99999999, timeoutSecs=timeoutSecs) num_cols = len(colNameDict) x = range(num_cols) # need to walk over a copy, cause we change x xOrig = x[:] for k in xOrig: name = colNameDict[k] if not name in keepList: if not noPrint: print "Removing %d because name: %s isn't in keepList %s" % (k, name, keepList) x.remove(k) if not noPrint: print "x has", len(x), "cols" strX = ",".join(map(str,x)) print "\nmatching keepList x:",strX return strX
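# A minimal, cluster-free sketch of what findXFromColumnInfo computes, for reference.
# The colNameDict literal below is made up for illustration; in real use it comes from
# h2o_cmd.columnInfoFromInspect as above. Keep only the indices whose names appear in
# keepList, then join them into the comma-separated string h2o expects for x=.
def sketch_find_x(colNameDict, keepList):
    x = [k for k in sorted(colNameDict) if colNameDict[k] in keepList]
    return ",".join(map(str, x))

# e.g. sketch_find_x({0: 'age', 1: 'zip', 2: 'income'}, ['age', 'income']) returns "0,2"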
def sub_c3_nongz_fvec_long(self, csvFilenameList): # a kludge h2o.setup_benchmark_log() bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using nongz'ed files in", importFolderPath if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern if DO_DOUBLE_IMPORT: (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key="A.hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # output 378 can't be in this ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541] ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'response': 'C379', 'max_iter': 10, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # convert to binomial # execExpr="A.hex=%s" % parseResult['destination_key'] # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def goodXFromColumnInfo(y, num_cols=None, missingValuesDict=None, constantValuesDict=None, enumSizeDict=None, colTypeDict=None, colNameDict=None, keepPattern=None, key=None, timeoutSecs=120, forRF=False, noPrint=False): y = str(y) # if we pass a key, means we want to get the info ourselves here if key is not None: (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, max_column_display=99999999, timeoutSecs=timeoutSecs) num_cols = len(colNameDict) # now remove any whose names don't match the required keepPattern if keepPattern is not None: keepX = re.compile(keepPattern) else: keepX = None x = range(num_cols) # need to walk over a copy, cause we change x xOrig = x[:] ignore_x = [] # for use by RF for k in xOrig: name = colNameDict[k] # remove it if it has the same name as the y output if str(k)== y: # if they pass the col index as y if not noPrint: print "Removing %d because name: %s matches output %s" % (k, str(k), y) x.remove(k) # rf doesn't want it in ignore list # ignore_x.append(k) elif name == y: # if they pass the name as y if not noPrint: print "Removing %d because name: %s matches output %s" % (k, name, y) x.remove(k) # rf doesn't want it in ignore list # ignore_x.append(k) elif keepX is not None and not keepX.match(name): if not noPrint: print "Removing %d because name: %s doesn't match desired keepPattern %s" % (k, name, keepPattern) x.remove(k) ignore_x.append(k) # missing values reports as constant also. so do missing first. # remove all cols with missing values # could change it against num_rows for a ratio elif k in missingValuesDict: value = missingValuesDict[k] if not noPrint: print "Removing %d with name: %s because it has %d missing values" % (k, name, value) x.remove(k) ignore_x.append(k) elif k in constantValuesDict: value = constantValuesDict[k] if not noPrint: print "Removing %d with name: %s because it has constant value: %s " % (k, name, str(value)) x.remove(k) ignore_x.append(k) # this is extra pruning.. # remove all cols with enums, if not already removed elif k in enumSizeDict: value = enumSizeDict[k] if not noPrint: print "Removing %d %s because it has enums of size: %d" % (k, name, value) x.remove(k) ignore_x.append(k) if not noPrint: print "x has", len(x), "cols" print "ignore_x has", len(ignore_x), "cols" x = ",".join(map(str,x)) ignore_x = ",".join(map(str,ignore_x)) if not noPrint: print "\nx:", x print "\nignore_x:", ignore_x if forRF: return ignore_x else: return x
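# Cluster-free sketch of goodXFromColumnInfo's two return modes (made-up inputs, not
# the real inspect dicts): columns are dropped if they match the response y or land in
# a "bad" set (missing, constant, or enum cols above); forRF=True returns the
# complement as an RF-style ignore list instead of the keep list.
def sketch_good_x(num_cols, y, badCols, forRF=False):
    x = [k for k in range(num_cols) if k != y and k not in badCols]
    ignore_x = [k for k in range(num_cols) if k != y and k in badCols]
    return ",".join(map(str, ignore_x if forRF else x))

# sketch_good_x(5, 4, badCols=set([1])) returns "0,2,3"; with forRF=True it returns "1"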
def test_rf_enums_mappings_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 3000 tryList = [ # (n, 1, 'cD', 300), # (n, 2, 'cE', 300), # (n, 3, 'cF', 300), # (n, 4, 'cG', 300), # (n, 5, 'cH', 300), # (n, 6, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), ] # SEED_FOR_TRAIN = random.randint(0, sys.maxint) SEED_FOR_TRAIN = 1234567890 SEED_FOR_SCORE = 9876543210 errorHistory = [] enumHistory = [] lastcolsTrainHistory = [] lastcolsScoreHistory = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: enumList = create_enum_list(listSize=ENUMS) # reverse the list enumList.reverse() # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename # use same enum List enumListForScore = enumList print "Creating random", csvPathname, "for rf model building" lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN) lastcolsTrainHistory.append(lastcols) print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)" # same enum list/mapping, but different dataset? lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE) lastcolsScoreHistory.append(lastcols) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' # limit depth and number of trees to accentuate the issue with categorical split decisions if SPEEDRF: kwargs = { 'destination_key': modelKey, 'response': y, 'num_trees': 1, 'max_depth': 100, 'oobee': 1, 'seed': 123456789, } else: kwargs = { 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': 1, 'max_depth': 100, 'min_rows': 1, 'validation': scoreDataKey, 'seed': 123456789, } for r in range(4): start = time.time() if SPEEDRF: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) else: rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' # print h2o.dump_json(rfResult) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5) errorHistory.append(classification_error) enumHistory.append(enumList) print "error from all runs on this dataset (with different enum mappings)" print errorHistory for e in enumHistory: print e print "last row from all train datasets, as integer" for l in lastcolsTrainHistory: print l print "last row from all score datasets, as integer" for l in lastcolsScoreHistory: print l
def test_GLM_enums_score_subset(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, key2, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0 } start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) GLMModel = glm['GLMModel'] modelKey = GLMModel['model_key'] parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum) glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm score end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] print "classErr:", classErr print "err:", err print "auc:", auc
def test_parse_time(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_time.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename colCount = 6 rowCount = 10 headerData = rand_header(colCount) write_syn_dataset(csvPathname, rowCount, colCount, headerData) for trial in range(1): rowData = rand_rowData() # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = csvFilename + "_" + str(trial) hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key) print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_key) numRowsA = inspect['numRows'] numColsA = inspect['numCols'] summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100, numCols=numColsA, numRows=numRowsA, noPrint=True) print summaryResult h2o_cmd.infoFromSummary(summaryResult) (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \ h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False) if constantValuesDictA or enumSizeDictA: raise Exception("Should be empty? constantValuesDictA %s enumSizeDictA %s" % (constantValuesDictA, enumSizeDictA)) print "missingValuesDictA", missingValuesDictA # self.assertEqual(missingValuesDictA, {}, "missingValuesDictA should be empty") self.assertEqual(numColsA, colCount) self.assertEqual(numRowsA, rowCount) # do a little testing of saving the key as a csv csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv" h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname) # remove the original parsed key. source was already removed by h2o h2o.nodes[0].remove_key(hex_key) # interesting. what happens when we do csv download with time data? start = time.time() parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key) print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_key) numRowsB = inspect['numRows'] numColsB = inspect['numCols'] summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100, numCols=numColsB, numRows=numRowsB, noPrint=True) (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \ h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False) print "missingValuesDictB", missingValuesDictB if constantValuesDictB or enumSizeDictB: raise Exception("Should be empty? constantValuesDictB %s enumSizeDictB %s" % (constantValuesDictB, enumSizeDictB)) self.assertEqual(missingValuesDictA, missingValuesDictB, "missingValuesDict mismatches after re-parse of downloadCsv result") self.assertEqual(numColsA, numColsB, "numCols mismatches after re-parse of downloadCsv result") # H2O adds a header to the csv created. It puts quotes around the col numbers if no header # but in this dataset we have a header too, so the row counts should be equal # if not, maybe the parse of our dataset didn't detect a row self.assertEqual(numRowsA, numRowsB, "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB)) # FIX! should do some comparison of values? # maybe can use exec to checksum the columns and compare column list. # or compare to expected values? (what are the expected values for the number for time inside h2o?) # FIX! should compare the results of the two parses.
# The infoFromInspect result? ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
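# The FIX! notes above suggest comparing the two parses. A local, cluster-free sketch
# of that idea (an illustration, not the test's actual mechanism): read both csvs and
# compare them row by row. Fields stay as strings, so no float conversion is assumed
# for the time data.
import csv

def csv_rows(pathname):
    with open(pathname) as f:
        return list(csv.reader(f))

# e.g. csv_rows(csvPathname)[1:] == csv_rows(csvDownloadPathname)[1:] is a strict
# data-row check (h2o may re-quote the header row, so compare data rows only)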
def test_exec_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 3, 2, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] for j in range(CUT_EXPR_CNT): print "Creating", CUT_EXPR_CNT, 'cut expressions' # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column # cel = colEnumList[c] cel = colEnumList # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' && '.join(cutExprList) print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] rowExpr = '%s[%s,];' % (hex_key, cutExpr) print "rowExpr:", rowExpr rowExprList.append(rowExpr) print "j:", j # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # print h2o.dump_json(inspect) rSummary = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(rSummary) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. 
randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==0: start = time.time() e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1)) elapsed = time.time() - start print "exec 1 took", elapsed, "seconds." execTime = elapsed if 1==1: start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." if 1==0: gKey = random.choice(eKeys) # do a 2nd random to see if things blow up start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey)) elapsed = time.time() - start print "exec 3 took", elapsed, "seconds." if 1==1: inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser() == 'kevin': tryList = [ (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not iColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range(0, numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i) # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) # print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col levels = h2o.nodes[0].levels(source=hex_key) l = levels['levels'] for column in range(iColCount, iColCount + oColCount): if l[column]: print "Skipping", column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist()) if 1 == 1: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
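# For reference, the offline side of quantile_comparisons can be approximated with
# numpy (an assumption about what it checks, not its actual implementation): read the
# same column from the csv, skipping the header, and compute the target percentile
# directly for comparison against h2o's answer.
import numpy as np

def offline_quantile(csvPathnameFull, col, quantile):
    a = np.genfromtxt(csvPathnameFull, delimiter=',', skip_header=1, usecols=(col,))
    return np.percentile(a, quantile * 100)

# e.g. offline_quantile(csvPathnameFull, column, 0.5 if DO_MEDIAN else 0.999)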
def sub_c3_nongz_fvec_long(self, csvFilenameList): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using nongz'ed files in", importFolderPath if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern if DO_DOUBLE_IMPORT: (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key="A.hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution() # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") execExpr = 'B.hex=A.hex' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) h2o_cmd.checkKeyDistribution() execExpr = 'C.hex=B.hex' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) h2o_cmd.checkKeyDistribution() execExpr = 'D.hex=C.hex' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) h2o_cmd.checkKeyDistribution()
def test_GLM2_enums_score_subset(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 500 tryList = [ # (n, 1, 'cD', 300), # (n, 2, 'cE', 300), # (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResultScore = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = {'response': y, 'max_iter': 8, 'family': 'binomial', 'n_folds': 2, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # Score ******************************* # this messes up if you use case_mode/case_value above predictKey = 'Predict.hex' modelKey = glm['glm_model']['_key'] h2o_cmd.runScore(dataKey="score_" + hex_key, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
def test_GLM_enums_unbalanced(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y+1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.5, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 1e-12}, {'alpha': 0.0, 'lambda': 0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? 
# the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception("Why take so many iterations: %s in this glm2 training?" % iteration) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
def test_GLM2_enums_score_superset(self): h2o.beta_features = True print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?" SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, "cD", 300), (n, 2, "cE", 300), (n, 3, "cF", 300), (n, 4, "cG", 300), (n, 5, "cH", 300), (n, 6, "cI", 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add an extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for glm model building" write_syn_dataset( csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar ) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset plus an unseen enum)" write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) y = colCount modelKey = "enums" kwargs = { "destination_key": modelKey, "response": y, "max_iter": 1, "n_folds": 1, "alpha": 0.2, "lambda": 1e-5, "family": "binomial", } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse( path=csvScorePathname, schema="put", hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt ) # Score ******************************* # this messes up if you use case_mode/case_value above predictKey = "Predict.hex" start = time.time() predictResult = h2o_cmd.runPredict( data_key=scoreDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs ) # just get a predict and AUC on the same data. 
# has to be binomial result resultAUC = h2o.nodes[0].generate_auc( thresholds=None, actual=scoreDataKey, predict="Predict.hex", vactual=y, vpredict=1 ) auc = resultAUC["AUC"] self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=scoreDataKey, predict=predictKey, vactual="C" + str(y + 1), vpredict="predict" ) cm = predictCMResult["cm"] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y + 1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.5, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 0 }, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" 
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True) glm = h2o.nodes[0].glm_view(_modelKey=modelKey) print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] if not validation or 'avg_err' not in validation: raise Exception("glm: %s" % h2o.dump_json(glm) + \ "\nNo avg_err in validation." + \ "\nLikely if you look back, the job was cancelled, so there's no cross validation.") avg_err = validation['avg_err'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'avg_err', avg_err print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception("Why take so many iterations: %s in this glm2 training?" % iteration) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=testDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=testDataKey, vactual='C' + str(y), predict=predictKey, vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) self.assertLess(pctWrong, 8, "Should see less than 8 pct error (class = 4): %s" % pctWrong) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) if 1 == 0: # stuff from GLM1 classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "score classErr:", classErr print "score err:", err print "score auc:", auc print "score resDev:", resDev print "score nullDev:", nullDev if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev']) raise Exception(emsg) # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev) raise Exception(emsg)
def sub_c3_nongz_fvec_long(self, csvFilenameList): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using nongz'ed files in", importFolderPath if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern if DO_DOUBLE_IMPORT: (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]: x.remove(i) ignore_x.append(i) x.remove(378) # add one since we are no longer 0 based offset x = ",".join(map(lambda x: "C" + str(x+1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # convert to binomial execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)' h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
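# The index bookkeeping in the DO_GLM block above, isolated as a runnable one-liner:
# h2o fvec names columns C1..Cn, so 0-based index 378 becomes the name 'C379' (the
# response) and the exec cut uses [,378+1]. The sample list here is illustrative only.
sample_ignored = [3, 4, 540]
print ",".join(map(lambda i: "C" + str(i + 1), sample_ignored))  # prints C4,C5,C541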
def sub_c3_fvec_long(self): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() avgMichalSize = 116561140 bucket = "home-0xdiag-datasets" ### importFolderPath = 'more1_1200_link' importFolderPath = "manyfiles-nflx-gz" print "Using .gz'ed files in", importFolderPath if len(h2o.nodes) == 1: csvFilenameList = [("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600)] else: csvFilenameList = [ ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 1800), ] if LOG_MACHINE_STATS: benchmarkLogging = ["cpu", "disk", "network"] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local") importFullList = importResult["files"] importFailList = importResult["fails"] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema="local", hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, ) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "Parse result['destination_key']:", parseResult["destination_key"] h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed ) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
# (378) ignore_x = [] for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]: x.remove(i) ignore_x.append(i) x.remove(378) # add one since we are no longer 0 based offset x = ",".join(map(lambda x: "C" + str(x + 1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x)) GLMkwargs = { "ignored_cols": ignore_x, "response": "C379", "max_iter": 4, "n_folds": 1, "family": "binomial", "alpha": 0.2, "lambda": 1e-5, } # convert to binomial execExpr = "A.hex=%s" % parseResult["destination_key"] h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ("C379", "C379", 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) aHack = {"destination_key": "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed ) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_parse_multi_header_rand(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(20): l = random.randint(1, 64) # random length headers headerName = ''.join( [random.choice(allowedLetters) for _ in range(l)]) headerChoices.append(headerName) # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ # FIX! one fails count for now # (1, 5, 9, 'cA', 60, 0), (1, 5, 9, 'cA', 60, 0), (1, 5, 25, 'cA', 60, 0), # try with col mismatch on header. # FIX! causes exception? don't test for now # (7, 300, 10, 'cA', 60, 0), # (7, 300, 10, 'cB', 60, 1), # (7, 300, 10, 'cC', 60, 2), # (7, 300, 10, 'cD', 60, 3), # (7, 300, 8, 'cA', 60, 0), # (7, 300, 8, 'cB', 60, 1), # (7, 300, 8, 'cC', 60, 2), # (7, 300, 8, 'cD', 60, 3), ] # so many random combos..rather than walk tryList, just do random for some amount of time for trial in range(50): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 # DATA_HAS_HDR_ROW = random.randint(0,1) DATA_HAS_HDR_ROW = 0 # PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = 0 ## DATA_FIRST_IS_COMMENT = random.randint(0,1) ## HEADER_FIRST_IS_COMMENT = random.randint(0,1) print "TEMPORARY: don't put any comments in" DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 # none is not legal # SEP_CHAR_GEN = random.choice(paramsDict['separator']) SEP_CHAR_GEN = "\t" print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', SEP_CHAR_GEN # they need to both use the same separator (h2o rule) hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] print hh print "UPDATE: always use comma (space legal also?) for header separator?? it should work no matter what separator the data uses?" headerForHeader = ",".join(hh) # make these different hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] headerForData = SEP_CHAR_GEN.join(hh) # random selection of parse param choices kwargs = {} for k, v in paramsDict.items(): aChoice = random.choice(v) # can tell h2o something different compared to what we actually used! if k == 'separator': if aChoice: sepChar = aChoice sepCharInt = ord(aChoice) # make it an integer for h2o else: sepChar = ',' # default char for None, need it for header/data file creation sepCharInt = None aChoice = sepCharInt kwargs[k] = aChoice # FOR NOW: ..override the rand choice if it exists, so we can parse and expect 'A' to be found # match what was gen'ed if choice is not None if kwargs['separator']: if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",": # parse doesn't auto-detect tab. 
will autodetect space and comma del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # create data files for fileN in range(fileNum): csvFilename = 'syn_data_' + str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset( csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # create the header file hdrFilename = 'syn_header_' + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset( hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) if PARSE_PATTERN_INCLUDES_HEADER: # only include header file data rows if the parse pattern includes it totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON'T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files). # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead? xs = h2o.nodes[0].import_files(SYNDATASETS_DIR)['keys'] headerKey = [x for x in xs if hdrFilename in x][0] dataKey = [x for x in xs if csvFilename in x][0] # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'syn_header': kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'syn_data': kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None print "If header_from_file= is used, we are currently required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = '*syn_*' + str(trial) + "_" + rowxcol + '*' else: pattern = '*syn_data_*' + str(trial) + "_" + rowxcol + '*' parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ??
self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data row as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData ### print (headerRowsDone!=0), (kwargs['header']==1), DATA_HAS_HDR_ROW, (kwargs['header_from_file'] is not None) if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows (header rows don't count) h2o: %s gen'ed: %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it h2oShouldSeeHeader = (HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)) or DATA_HAS_HDR_ROW if h2oShouldSeeHeader: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1, 'ignore': 'A'} else: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() # the RF that would consume these kwargs is currently disabled: # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
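# Note: the h2oLosesOneData/h2oGainsOneData bookkeeping above reduces to a small
# adjustment rule. A minimal restatement of the same logic as a standalone helper,
# for illustration only (name and signature are hypothetical, not part of the suite):
def expected_data_rows(totalDataRows, headerRowsDone, headerParam, dataHasHdrRow, headerFromFile):
    # h2o eats one data row as a header when told header=1 but none was generated
    if headerRowsDone == 0 and headerParam == 1 and not dataHasHdrRow:
        totalDataRows -= 1
    # a header row inside the data file gets counted as data when a separate header file wins
    if headerRowsDone != 0 and headerParam == 1 and dataHasHdrRow and headerFromFile is not None:
        totalDataRows += 1
    return totalDataRows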
def test_GLM_many_enums(self): SYNDATASETS_DIR = h2o.make_syn_dir() if not localhost: n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] else: n = 150 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # just randomly pick the row and col cases. colSepCase = random.randint(0,1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase==0): colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0} start = time.time() ### glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
def test_rf_enums_score_superset_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 3000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add an extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for rf model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' ntrees = 5 kwargs = { 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': ntrees, 'validation': scoreDataKey, } start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees) predictKey = 'Predict.hex' h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
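# Note: expectedAuc=0.5 above appears to be the right target because the synthetic enum
# columns are generated independently of the response column, so scoring should land near
# chance, even with the extra "xyzzy" level that the model never saw during training.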
def sub_c2_nongz_fvec_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath csvFilenameList= [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern # double import still causing problems? # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # importFullList = importResult['files'] # importFailList = importResult['fails'] # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # remove the output too! (378) ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541] ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'family': 'binomial', 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") # convert to binomial execExpr="A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
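# Note: a Python dict literal silently keeps only the last value when a key is repeated
# (e.g. 'family' listed twice in a GLMkwargs literal), so such a duplicate never raises
# an error; it is just misleading to read. A minimal sketch of that behavior:
d = {'family': 'binomial', 'alpha': 0.2, 'family': 'binomial'}
assert d == {'family': 'binomial', 'alpha': 0.2}
assert len(d) == 2  # the repeated key collapses into a single entry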
def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser()=='kevin': tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not iColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range(numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i) # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col l = levels['levels'] for column in range(iColCount, iColCount+oColCount): if l[column]: print "Skipping", column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist) if 1==0: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
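# Note: a rough reference for the kind of value the h2o_summ comparison above checks h2o's
# quantiles against: linear interpolation between order statistics on a sorted list (the
# common "type 7" percentile definition). The helper below is an illustrative sketch, not
# the actual h2o_summ.percentileOnSortedlist implementation:
def percentile_on_sorted(sortedVals, q):
    # q in [0,1]; position along the n-1 gaps, interpolate between the two neighbors
    pos = q * (len(sortedVals) - 1)
    lo = int(pos)
    hi = min(lo + 1, len(sortedVals) - 1)
    frac = pos - lo
    return sortedVals[lo] * (1 - frac) + sortedVals[hi] * frac

print percentile_on_sorted(sorted([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0]), 0.5)  # 3.5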
def sub_c2_nongz_fvec_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath csvFilenameList = [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern # double import still causing problems? # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # importFullList = importResult['files'] # importFailList = importResult['fails'] # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]: x.remove(i) ignore_x.append(i) # plus 1 because we are no longer 0 offset x = ",".join(map(lambda x: "C" + str(x + 1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'family': 'binomial', 'response': 'C379', 'max_iter': 4, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } # are the unparsed keys slowing down exec?
h2i.delete_keys_at_all_nodes(pattern="manyfile") # convert to binomial execExpr = "A.hex=%s" % parseResult['destination_key'] h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
def test_GLM_enums_unbalanced(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm2 model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) testDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey, timeoutSecs=30, separator=colSepInt) y = colCount modelKey = 'glm_model' kwargs = { 'standardize': 0, 'destination_key': modelKey, 'response': 'C' + str(y + 1), 'max_iter': 200, 'family': 'binomial', 'n_folds': 0, 'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.5, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 1e-12 }, { 'alpha': 0.0, 'lambda': 0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) print "If we poll, we get a message saying it was cancelled by user??" glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm2 end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' glm_model = glm['glm_model'] _names = glm_model['_names'] modelKey = glm_model['_key'] coefficients_names = glm_model['coefficients_names'] submodels = glm_model['submodels'][0] beta = submodels['beta'] norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? 
the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iteration > 20: raise Exception( "Why take so many iterations: %s in this glm2 training?" % iteration) # Score ********************************************** print "Problems with test data having different enums than train? just use train for now" testDataKey = hex_key h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
def test_parse_time_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_time.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename colCount = 6 rowCount = 10 headerData = rand_header(colCount) write_syn_dataset(csvPathname, rowCount, colCount, headerData) for trial in range(1): rowData = rand_rowData() # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = csvFilename + "_" + str(trial) hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key) print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_key) numRowsA = inspect['numRows'] numColsA = inspect['numCols'] summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100, numCols=numColsA, numRows=numRowsA, noPrint=True) print summaryResult h2o_cmd.infoFromSummary(summaryResult) (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \ h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False) if constantValuesDictA or enumSizeDictA: raise Exception("Should be empty? constantValuesDictA %s enumSizeDictA %s" % (constantValuesDictA, enumSizeDictA)) print "missingValuesDictA", missingValuesDictA # self.assertEqual(missingValuesDictA, {}, "missingValuesDict should be empty") self.assertEqual(numColsA, colCount) self.assertEqual(numRowsA, rowCount) # do a little testing of saving the key as a csv csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv" h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname) # remove the original parsed key. source was already removed by h2o h2o.nodes[0].remove_key(hex_key) # interesting. what happens when we do csv download with time data? start = time.time() parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key) print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds' inspect = h2o_cmd.runInspect(key=hex_key) numRowsB = inspect['numRows'] numColsB = inspect['numCols'] summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100, numCols=numColsB, numRows=numRowsB, noPrint=True) (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \ h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False) print "missingValuesDictB", missingValuesDictB if constantValuesDictB or enumSizeDictB: raise Exception("Should be empty? constantValuesDictB %s enumSizeDictB %s" % (constantValuesDictB, enumSizeDictB)) self.assertEqual(missingValuesDictA, missingValuesDictB, "missingValuesDict mismatches after re-parse of downloadCsv result") self.assertEqual(numColsA, numColsB, "numCols mismatches after re-parse of downloadCsv result") # H2O adds a header to the csv created. It puts quotes around the col numbers if no header # but in this dataset we have a header too, so the row counts should be equal # if not, maybe the parse of our dataset didn't detect a row self.assertEqual(numRowsA, numRowsB, "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB) ) # FIX! should do some comparison of values? # maybe can use exec to checksum the columns and compare column list. # or compare to expected values? (what are the expected values for the number for time inside h2o?) # FIX! should compare the results of the two parses.
The infoFromInspect result? ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect") h2o.check_sandbox_for_errors()
def test_GLM_many_rooz_enums(self): SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: n = 4000 tryList = [ (n, 999, 'cI', 300), ] else: n = 5 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: # can randomly pick the row and col cases. ### colSepCase = random.randint(0,1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase==0): colSepHexString = '01' else: colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) if DO_TEN_INTEGERS: csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv' else: csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename # we allow some NAs in the list above (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseKey['destination_key'],exceptionOnMissingValues=False) y = colCount x = range(colCount) x = ",".join(map(str,x)) # kwargs = {'x': x, 'y': y, 'max_iter': 6, 'n_folds': 1, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'poisson', 'case_mode': '=', 'case': 0} kwargs = {'y': y, 'max_iter': 6, 'n_folds': 1, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'poisson', 'case_mode': '=', 'case': 0} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def sub_c3_nongz_fvec_long(self, csvFilenameList): # a kludge h2o.setup_benchmark_log() bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using nongz'ed files in", importFolderPath if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern if DO_DOUBLE_IMPORT: (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key="A.hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution() # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") execExpr = 'B.hex=A.hex' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) h2o_cmd.checkKeyDistribution() execExpr = 'C.hex=B.hex' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) h2o_cmd.checkKeyDistribution() execExpr = 'D.hex=C.hex' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) h2o_cmd.checkKeyDistribution()
def test_parse_full_rand(self): SYNDATASETS_DIR = h2o.make_syn_dir() if DEBUG: n = 20 else: n = 1000000 # from command line arg -long if h2o.long_test_case: repeat = 1000 scale = 10 # scale up the # of rows tryList = [ (n * scale, 3, 'cI', 300), ] else: repeat = 1 scale = 1 tryList = [ (n, 3, 'cI', 300), ] lastcolsHistory = [] for r in range(repeat): SEED_PER_FILE = random.randint(0, sys.maxint) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # same enum list/mapping, but different dataset? start = time.time() lastcols = write_syn_dataset(csvPathname, rowCount, colCount, scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE) elapsed = time.time() - start print "took %s seconds to create %s" % (elapsed, csvPathname) # why are we saving this? lastcolsHistory.append(lastcols) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0, timeoutSecs=60, separator=colSepInt, doSummary=DO_SUMMARY) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect( key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] h2o_cmd.infoFromInspect(inspect) # Each column should get .10 random NAs per iteration. Within 10%? missingValuesList = h2o_cmd.infoFromInspect(inspect) # print "missingValuesList", missingValuesList # for mv in missingValuesList: # self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, # msg='mv %s is not approx. expected %s' % (mv, expectedNA)) # might have extra rows if numRows < rowCount: raise Exception( "Expect numRows %s >= rowCount %s since we can have extra eols" % (numRows, rowCount)) # numCols should be right? self.assertEqual(colCount, numCols) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=DISABLE_ALL_NA)
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-4 }, { 'alpha': 0.25, 'lambda': 1e-6 }, { 'alpha': 0.0, 'lambda': 1e-8 }, { 'alpha': 0.5, 'lambda': 0.0 }, { 'alpha': 0.0, 'lambda': 0.0 }, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" 
% iterations) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) start = time.time() # score with same dataset (will change to recreated dataset with one less enum) glmScore = h2o_cmd.runGLMScore( key=parseResult['destination_key'], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' ### print h2o.dump_json(glmScore) classErr = glmScore['validation']['classErr'] auc = glmScore['validation']['auc'] err = glmScore['validation']['err'] nullDev = glmScore['validation']['nullDev'] resDev = glmScore['validation']['resDev'] h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs) print "classErr:", classErr print "err:", err print "auc:", auc print "resDev:", resDev print "nullDev:", nullDev # what is reasonable? # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err) # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) if math.isnan(err): emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err) raise Exception(emsg) if math.isnan(resDev): emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev) raise Exception(emsg) if math.isnan(nullDev): emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev) raise Exception(emsg)
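# Note: the alpha/lambda pairs swept by updateList in the test above walk the usual
# elastic-net penalty,
#   penalty(beta) = lambda * (alpha * ||beta||_1 + (1 - alpha)/2 * ||beta||_2^2)
# so alpha=0 is pure ridge, alpha=1 pure lasso, and lambda=0 is unpenalized. This matches
# the standard formulation; a tiny illustrative sketch (not part of the suite):
def elastic_net_penalty(beta, alpha, lam):
    l1 = sum(abs(b) for b in beta)
    l2 = sum(b * b for b in beta)
    return lam * (alpha * l1 + (1 - alpha) / 2.0 * l2)

print elastic_net_penalty([0.5, -1.0, 2.0], 0.5, 1e-4)  # 0.00030625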
def test_GLM2_many_rooz_enums(self): SYNDATASETS_DIR = h2o.make_syn_dir() if 1 == 0 and localhost: n = 4000 tryList = [ (n, 999, 'cI', 300), ] else: n = 100 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # can randomly pick the row and col cases. ### colSepCase = random.randint(0,1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase == 0): colSepHexString = '01' else: colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0, 1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase == 0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) if DO_TEN_INTEGERS: csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str( colCount) + '.csv' else: csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename # we allow some NAs in the list above (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],exceptionOnMissingValues=False) y = colCount kwargs = { 'use_all_factor_levels': 1, 'response': y, 'max_iter': 6, 'n_folds': 1, 'alpha': 0.0, 'lambda': 1e-5, 'family': 'poisson' } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def sub_c2_rel_long(self): # a kludge h2o.setup_benchmark_log() avgMichalSize = 116561140 bucket = 'home-0xdiag-datasets' ### importFolderPath = 'more1_1200_link' importFolderPath = 'manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath if len(h2o.nodes)==1: csvFilenameList= [ ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600), ] else: csvFilenameList= [ ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ] if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) ignore_x = [] # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]: for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]: x.remove(i) ignore_x.append(i) # increment by one, because we are no longer zero offset! x = ",".join(map(lambda x: "C" + str(x+1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'family': 'binomial', 'x': x, 'y': 'C379', 'case': 15, 'case_mode': '>', 'max_iter': 4, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5 } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
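# Note: the case_mode='>'/case=15 pair above is the old-API way to binarize the response
# server-side; it corresponds to the fvec-era exec expression used elsewhere in this
# suite, A.hex[,379]=(A.hex[,379]>15), which rewrites C379 into a 0/1 column before GLM.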
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1, 64) # random length headers headerName = ''.join( [random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0, 1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0, 1) GZIP_HEADER = random.randint(0, 1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k, v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0, 1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0, 1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset( csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename( csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset( hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename( hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathname # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON'T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files).
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead? # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) h2o_cmd.runStoreView() headerKey = h2i.find_key(hdrFilename) dataKey = h2i.find_key(csvFilename) # use regex. the only files in the dir will be the ones we just created # with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'header': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'data': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey): ignoreForRf = hfhList[0] elif DATA_HAS_HDR_ROW: ignoreForRf = hfdList[0] else: ignoreForRf = None print "If header_from_file= is used, we are currently required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*' else: pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*' # don't pass to parse kwargs.pop('hdr_separator', None) parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, # causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data row as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) h2oGainsOneData = False # override: the "gains one" case is disabled for now print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 if 1 == 0: # FIX!
don't check for now self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it kwargs = { 'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf } start = time.time() # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1,64) # random length headers headerName = ''.join([random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0,1) GZIP_HEADER = random.randint(0,1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k,v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN==" " or SEP_CHAR_GEN==",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0,1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0,1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathnameh # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. 
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
# put them, rather than using import files, so this works if remote h2o is used
# and python creates the files locally
fileList = os.listdir(SYNDATASETS_DIR)
for f in fileList:
    h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
h2o_cmd.runStoreView()
headerKey = h2i.find_key(hdrFilename)
dataKey = h2i.find_key(csvFilename)
# use regex. the only files in the dir will be the ones we just created, with *fileN* match
print "Header Key =", headerKey

# put the right name in
if kwargs['header_from_file'] == 'header':
    # do we need to add the .hex suffix we know h2o will append
    kwargs['header_from_file'] = headerKey
# use one of the data files?
elif kwargs['header_from_file'] == 'data':
    # do we need to add the .hex suffix we know h2o will append
    kwargs['header_from_file'] = dataKey

# if there's no header in the header file, turn off the header_from_file
if not HEADER_HAS_HDR_ROW:
    kwargs['header_from_file'] = None

if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
    ignoreForRf = hfhList[0]
elif DATA_HAS_HDR_ROW:
    ignoreForRf = hfdList[0]
else:
    ignoreForRf = None

print "If header_from_file is set, h2o requires header=1 to be forced"
if kwargs['header_from_file']:
    kwargs['header'] = 1
# if we have a header in a data file, tell h2o (for now)
elif DATA_HAS_HDR_ROW:
    kwargs['header'] = 1
else:
    kwargs['header'] = 0

# may have error if h2o doesn't get anything!
start = time.time()
if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
    pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
else:
    pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

# don't pass to parse
kwargs.pop('hdr_separator', None)
parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
print "parseResult['destination_key']: " + parseResult['destination_key']

inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
h2o_cmd.infoFromInspect(inspect, csvPathname)
print "\n" + csvPathname, \
    " numRows:", "{:,}".format(inspect['numRows']), \
    " numCols:", "{:,}".format(inspect['numCols'])

# more reporting: (we can error here if extra col in header, causes all NA for missing col of data)
h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

# should match # of cols in header or ??
self.assertEqual(inspect['numCols'], totalCols,
    "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

# do we end up parsing one data row as a header because of mismatch in gen/param
h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW
# header in data file gets treated as data
h2oGainsOneData = (headerRowsDone != 0) and (kwargs['header'] == 1) and \
    DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
# override: the gains case is disabled for now
h2oGainsOneData = False
print "h2oLosesOneData:", h2oLosesOneData
print "h2oGainsOneData:", h2oGainsOneData
if h2oLosesOneData:
    totalDataRows -= 1
if h2oGainsOneData:
    totalDataRows += 1

if 1 == 0:  # FIX!
    # don't check for now
    self.assertEqual(inspect['numRows'], totalDataRows,
        "parse created result with the wrong number of rows h2o %s gen'ed: %s" %
        (inspect['numRows'], totalDataRows))

# put in an ignore param, that will fail unless headers were parsed correctly
# doesn't matter if the header got a comment, should see it
kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
start = time.time()
# h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
elapsed = time.time() - start
print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
    'took', time.time() - start, 'seconds'

h2o.check_sandbox_for_errors()
h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
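# Sidebar: h2o_util.file_gzip() is called in the test above but defined elsewhere.
# A minimal sketch of such a helper (hypothetical stand-in, assuming only the two
# pathname arguments seen in these tests): stream the plain csv into a .gz so the
# test can rename the original out of the way and let the parse pattern match the
# gzip'ed copy instead.
import gzip
import shutil

def file_gzip_sketch(srcPathname, dstPathnamegz):
    # stream-copy so large synthetic files never have to fit in memory
    src = open(srcPathname, 'rb')
    dst = gzip.open(dstPathnamegz, 'wb')
    try:
        shutil.copyfileobj(src, dst)
    finally:
        src.close()
        dst.close()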
def test_GLM2_enums_score_subset(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 500
    tryList = [
        # (n, 1, 'cD', 300),
        # (n, 2, 'cE', 300),
        # (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
    ]

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # using the comma is nice to ensure no craziness
        colSepHexString = '2c'  # comma
        colSepChar = colSepHexString.decode('hex')
        colSepInt = int(colSepHexString, base=16)
        print "colSepChar:", colSepChar

        rowSepHexString = '0a'  # newline
        rowSepChar = rowSepHexString.decode('hex')
        print "rowSepChar:", rowSepChar

        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

        enumList = create_enum_list(listSize=10)
        # use half of the enums for creating the scoring dataset
        enumListForScore = random.sample(enumList, 5)

        print "Creating random", csvPathname, "for glm model building"
        write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
        write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE,
            colSepChar=colSepChar, rowSepChar=rowSepChar)

        # parse the scoring dataset first, then the model-building dataset,
        # so parseResult below points at the frame the GLM should train on
        h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key,
            timeoutSecs=30, separator=colSepInt)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, separator=colSepInt)

        print "\n" + csvFilename
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

        y = colCount
        kwargs = {
            'response': y,
            'max_iter': 8,
            'family': 'binomial',
            'n_folds': 2,
            'alpha': 0.2,
            'lambda': 1e-5,
        }

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
        print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Score *******************************
        # this messes up if you use case_mode/case_value above
        predictKey = 'Predict.hex'
        modelKey = glm['glm_model']['_key']
        h2o_cmd.runScore(dataKey="score_" + hex_key, modelKey=modelKey,
            vactual=y, vpredict=1, expectedAuc=0.6)
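# Sidebar: the separator bookkeeping in these tests keeps three views of one
# byte, which is easy to misread. A standalone demo of the idiom (plain
# Python 2, no h2o needed):
colSepHexStringDemo = '2c'                  # two hex chars
print colSepHexStringDemo.decode('hex')     # the byte itself: ','
print int(colSepHexStringDemo, base=16)     # 44, the int handed to separator=
assert '0a'.decode('hex') == '\n'           # row separator, unix
assert '0d0a'.decode('hex') == '\r\n'       # row separator, windows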
if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseResult['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseResult directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file"
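# Sidebar: the exec cut in the fragment above is just a string-built expression
# handed to h2o. A sketch of the two variants mentioned there (randomFilter is
# the commented-out alternative; origKeyDemo is a hypothetical destination key):
origKeyDemo = 'parsed.hex'
print 'a = slice(' + origKeyDemo + ',1,200)'
print 'a = randomFilter(' + origKeyDemo + ',200,12345678)'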
def test_find_numbers(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 3 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness if COL_SEP_HIVE: colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
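# Sidebar: write_syn_dataset() isn't shown in this excerpt. Going by the
# "more choices for the unquoted string" comment above, quoteChars widens the
# character set of field values when the hive ctrl-A (0x01) separator is in
# play, since commas and quotes are then legal inside unquoted fields. A
# hedged, illustrative field generator (not the real helper):
import random
import string

def make_field_sketch(quoteChars=""):
    alphabet = string.ascii_lowercase + quoteChars
    return ''.join(random.choice(alphabet) for _ in range(4))

def make_row_sketch(colCount, colSepChar, quoteChars=""):
    return colSepChar.join(make_field_sketch(quoteChars) for _ in range(colCount))

print make_row_sketch(5, '\x01', quoteChars=",'\"")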
def test_rf_enums_score_superset_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = 3000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for rf model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' ntrees = 5 kwargs = { 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': ntrees, 'validation': scoreDataKey, } start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees) predictKey = 'Predict.hex' h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
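# Sidebar: create_enum_list() is defined elsewhere in these test files. A
# hedged sketch of the idea, to make the "xyzzy" superset trick above concrete:
# train on N random enum strings, then score on a sample of them plus one
# level the model never saw. The data is random anyway, which is why the test
# only expects AUC near 0.5.
import random
import string

def create_enum_list_sketch(listSize=10, strLen=5):
    return [''.join(random.choice(string.ascii_letters) for _ in range(strLen))
            for _ in range(listSize)]

enumListDemo = create_enum_list_sketch(listSize=10)
enumListForScoreDemo = random.sample(enumListDemo, 5)
enumListForScoreDemo.append("xyzzy")   # enum level absent from training data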
def test_GLM_enums_unbalanced(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt) y = colCount kwargs = { 'y': y, 'max_iter': 200, 'family': 'binomial', 'n_folds': 10, 'alpha': 0, 'lambda': 0, 'thresholds': 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-4}, {'alpha': 0.25, 'lambda': 1e-6}, {'alpha': 0.0, 'lambda': 1e-8}, {'alpha': 0.5, 'lambda': 0.0}, {'alpha': 0.0, 'lambda': 0.0}, ] # Try each one h2o.beta_features = True for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' GLMModel = glm['GLMModel'] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel['iterations'] modelKey = GLMModel['model_key'] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) if iterations > 20: raise Exception("Why take so many iterations: %s in this glm training?" 
        % iterations)

start = time.time()
# score with same dataset (will change to recreated dataset with one less enum)
glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'], model_key=modelKey,
    thresholds="0.5", timeoutSecs=timeoutSecs)
print "glm score end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
### print h2o.dump_json(glmScore)
classErr = glmScore['validation']['classErr']
auc = glmScore['validation']['auc']
err = glmScore['validation']['err']
nullDev = glmScore['validation']['nullDev']
resDev = glmScore['validation']['resDev']
h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
print "classErr:", classErr
print "err:", err
print "auc:", auc
print "resDev:", resDev
print "nullDev:", nullDev

# what is reasonable?
# self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

if math.isnan(err):
    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
    raise Exception(emsg)
if math.isnan(resDev):
    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
    raise Exception(emsg)
if math.isnan(nullDev):
    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
    raise Exception(emsg)
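# Sidebar: the isnan checks above (and their twins later in this collection)
# repeat one pattern. A small helper in the same spirit, sketched here rather
# than taken from the original tests, keeps the message format in one place:
import math

def assert_not_nan_sketch(name, value):
    if math.isnan(value):
        raise Exception("Why is this %s = 'nan'?? %6s %s" % (name, name + ":\t", value))

# usage: assert_not_nan_sketch('err', err)
#        assert_not_nan_sketch('resDev', resDev)
#        assert_not_nan_sketch('nullDev', nullDev)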
def test_GLM2_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'xyz' kwargs = { 'n_folds': 0, 'destination_key': modelKey, 'response': y, 'max_iter': 200, 'family': 'binomial', 'alpha': 0, 'lambda': 0, } start = time.time() updateList = [ { 'alpha': 0.5, 'lambda': 1e-5 }, # {'alpha': 0.25, 'lambda': 1e-4}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex", timeoutSecs=30, separator=colSepInt) h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, vactual='C' + str(y + 1), vpredict=1, expectedAuc=0.45)
def test_rf_many_rooz_enums_fvec(self): SYNDATASETS_DIR = h2o.make_syn_dir() if 1==0 and localhost: n = 4000 tryList = [ (n, 999, 'cI', 300), ] else: n = 100 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # can randomly pick the row and col cases. ### colSepCase = random.randint(0,1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase==0): colSepHexString = '01' else: colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0,1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase==0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) if DO_TEN_INTEGERS: csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv' else: csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvFilename # we allow some NAs in the list above (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],exceptionOnMissingValues=False) y = colCount ntrees = 5 kwargs = { 'response': y, 'classification': 1, 'ntrees': ntrees, } start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", csvPathname, 'took', time.time() - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees) modelKey = rfResult['drf_model']['_key'] h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, vactual=colCount, vpredict=1, expectedAuc=0.5, doAUC=False)
def test_parse_rand_enum_compress(self): SYNDATASETS_DIR = h2o.make_syn_dir() if DEBUG: n = 20 else: n = 1000000 # from command line arg -long if h2o.long_test_case: repeat = 1000 scale = 10 # scale up the # of rows tryList = [ (n*scale, 1, 'cI', 300), (n*scale, 1, 'cI', 300), (n*scale, 1, 'cI', 300), ] else: repeat = 1 scale = 1 tryList = [ (n, 3, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), ] lastcolsHistory = [] enumList = create_enum_list(listSize=ENUMS_NUM) for r in range(repeat): SEED_PER_FILE = random.randint(0, sys.maxint) for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # same enum list/mapping, but different dataset? start = time.time() lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE) elapsed = time.time() - start print "took %s seconds to create %s" % (elapsed, csvPathname) # why are we saving this? lastcolsHistory.append(lastcols) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0, timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] h2o_cmd.infoFromInspect(inspect) # Each column should get .10 random NAs per iteration. Within 10%? missingValuesList = h2o_cmd.infoFromInspect(inspect) # print "missingValuesList", missingValuesList # for mv in missingValuesList: # self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, # msg='mv %s is not approx. expected %s' % (mv, expectedNA)) self.assertEqual(rowCount, numRows) self.assertEqual(colCount, numCols) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=DISABLE_ALL_NA)
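# Sidebar: passing SEED_PER_FILE into write_syn_dataset() is what makes each
# file reproducible independently of the module-level RNG ("same enum
# list/mapping, but different dataset"). Sketch of the technique with a private
# random.Random instance (illustrative, not the real writer):
import random

def write_rows_sketch(rowCount, colCount, SEED):
    r = random.Random(SEED)  # per-file generator, isolated state
    rows = []
    for _ in range(rowCount):
        rows.append(','.join(str(r.randint(0, 9)) for _ in range(colCount)))
    return rows

# same seed, same rows, no matter what the global RNG did in between
assert write_rows_sketch(3, 4, 1234) == write_rows_sketch(3, 4, 1234)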
def test_GLM_ints_unbalanced(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, "cD", 300), (n, 2, "cE", 300), (n, 4, "cF", 300), (n, 8, "cG", 300), (n, 16, "cH", 300), (n, 32, "cI", 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = "2c" # comma colSepChar = colSepHexString.decode("hex") colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = "0a" # newline rowSepChar = rowSepHexString.decode("hex") print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv" csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset( csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar ) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset( csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, ) parseResult = h2i.import_parse( path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt ) print csvFilename, "parse time:", parseResult["response"]["time"] print "Parse result['destination_key']:", parseResult["destination_key"] print "\n" + csvFilename ( missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict, ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True) y = colCount kwargs = { "y": y, "max_iter": 200, "family": "binomial", "n_folds": 10, "alpha": 0, "lambda": 0, "thresholds": 0.5, # 'case_mode': '=', # 'case': 0, } start = time.time() updateList = [ {"alpha": 0.5, "lambda": 1e-4}, {"alpha": 0.25, "lambda": 1e-6}, {"alpha": 0.0, "lambda": 1e-8}, {"alpha": 0.5, "lambda": 0.0}, {"alpha": 0.0, "lambda": 0.0}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds" GLMModel = glm["GLMModel"] # submodels0 = GLMModel['submodels'][0] iterations = GLMModel["iterations"] modelKey = GLMModel["model_key"] h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # if iterations > 20: # raise Exception("Why take so many iterations: %s in this glm training?" 
#         % iterations)

parseResult = h2i.import_parse(
    path=csvScorePathname, schema="put", hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt
)

start = time.time()
# score with same dataset (will change to recreated dataset with one less enum)
glmScore = h2o_cmd.runGLMScore(
    key=parseResult["destination_key"], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs
)
print "glm score end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
### print h2o.dump_json(glmScore)
classErr = glmScore["validation"]["classErr"]
auc = glmScore["validation"]["auc"]
err = glmScore["validation"]["err"]
nullDev = glmScore["validation"]["nullDev"]
resDev = glmScore["validation"]["resDev"]
h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
print "classErr:", classErr
print "err:", err
print "auc:", auc
print "resDev:", resDev
print "nullDev:", nullDev

# what is reasonable?
# self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
# self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

if math.isnan(err):
    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
    raise Exception(emsg)
if math.isnan(resDev):
    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
    raise Exception(emsg)
if math.isnan(nullDev):
    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
    raise Exception(emsg)
def test_find_numbers(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = 3 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness if COL_SEP_HIVE: colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0, 1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase == 0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)
def test_GLM_many_enums(self): SYNDATASETS_DIR = h2o.make_syn_dir() if not localhost: n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] else: n = 150 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), (n, 10, 'cLA', 300), (n, 11, 'cDA', 300), (n, 12, 'cEA', 300), (n, 13, 'cFA', 300), (n, 14, 'cGA', 300), (n, 15, 'cHA', 300), (n, 16, 'cIA', 300), (n, 17, 'cJA', 300), (n, 19, 'cKA', 300), (n, 20, 'cLA', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: # just randomly pick the row and col cases. colSepCase = random.randint(0, 1) colSepCase = 1 # using the comma is nice to ensure no craziness if (colSepCase == 0): colSepHexString = '01' quoteChars = ",\'\"" # more choices for the unquoted string else: colSepHexString = '2c' # comma quoteChars = "" colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar print "colSepInt", colSepInt rowSepCase = random.randint(0, 1) # using this instead, makes the file, 'row-readable' in an editor if (rowSepCase == 0): rowSepHexString = '0a' # newline else: rowSepHexString = '0d0a' # cr + newline (windows) \r\n rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars) # FIX! does 'separator=' take ints or ?? hex format # looks like it takes the hex string (two chars) parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30, separator=colSepInt) print csvFilename, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey[ 'destination_key'] # We should be able to see the parse result? ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True) y = colCount kwargs = { 'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'case_mode': '=', 'case': 0 } start = time.time() ### glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds'
def test_rf_enums_mappings(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (n, 1, 'cD', 300), # (n, 2, 'cE', 300), # (n, 3, 'cF', 300), # (n, 4, 'cG', 300), # (n, 5, 'cH', 300), # (n, 6, 'cI', 300), (ROWS, COLS, 'cI', 300), (ROWS, COLS, 'cI', 300), (ROWS, COLS, 'cI', 300), ] # SEED_FOR_TRAIN = random.randint(0, sys.maxint) SEED_FOR_TRAIN = 1234567890 SEED_FOR_SCORE = 9876543210 errorHistory = [] enumHistory = [] lastcolsTrainHistory = [] lastcolsScoreHistory = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: enumList = create_enum_list(listSize=ENUMS) # reverse the list enumList.reverse() # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename # use same enum List enumListForScore = enumList print "Creating random", csvPathname, "for rf model building" lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN) lastcolsTrainHistory.append(lastcols) print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)" # same enum list/mapping, but different dataset? lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE) lastcolsScoreHistory.append(lastcols) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' # limit depth and number of trees to accentuate the issue with categorical split decisions # use mtries so both look at all cols at every split? doesn't matter for speedrf # does speedrf try one more time? with 3 cols, mtries=2, so another try might # get a look at the missing col # does matter for drf2. 
does it "just stop" # trying mtries always looking at all columns or 1 col might be interesting if SPEEDRF: kwargs = { 'sample_rate': 0.999, 'destination_key': modelKey, 'response': y, 'ntrees': 1, 'max_depth': 100, # 'oobee': 1, 'validation': hex_key, # 'validation': scoreDataKey, 'seed': 123456789, 'mtries': COLS, } elif GBM: kwargs = { 'destination_key': modelKey, 'response': y, 'validation': scoreDataKey, 'seed': 123456789, # 'learn_rate': .1, 'ntrees': 1, 'max_depth': 100, 'min_rows': 1, 'classification': 1, } else: kwargs = { 'sample_rate': 0.999, 'destination_key': modelKey, 'response': y, 'classification': 1, 'ntrees': 1, 'max_depth': 100, 'min_rows': 1, 'validation': hex_key, # 'validation': scoreDataKey, 'seed': 123456789, 'nbins': 1024, 'mtries': COLS, } for r in range(2): start = time.time() if GBM: gbmResult = h2o_cmd.runGBM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "gbm end on ", parseResult[ 'destination_key'], 'took', time.time( ) - start, 'seconds' # print h2o.dump_json(gbmResult) (classification_error, classErrorPctList, totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult) elif SPEEDRF: rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "speedrf end on ", parseResult[ 'destination_key'], 'took', time.time( ) - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) else: rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "rf end on ", parseResult[ 'destination_key'], 'took', time.time( ) - start, 'seconds' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult) h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5) errorHistory.append(classification_error) enumHistory.append(enumList) print "error from all runs on this dataset (with different enum mappings)" print errorHistory for e in enumHistory: print e print "last row from all train datasets, as integer" for l in lastcolsTrainHistory: print l print "last row from all score datasets, as integer" for l in lastcolsScoreHistory: print l
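# Sidebar: the SPEEDRF/GBM/RF kwargs blocks above differ only in a handful of
# entries. One way to express the same selection (a sketch keeping the values
# from the test; 'train.hex'/'score.hex' and yDemo are hypothetical stand-ins)
# is a shared base dict plus per-algorithm overrides:
modelKeyDemo = 'enums'
yDemo = 3
base = {'destination_key': modelKeyDemo, 'response': yDemo, 'ntrees': 1,
        'max_depth': 100, 'seed': 123456789}
overrides = {
    'speedrf': {'sample_rate': 0.999, 'mtries': 3, 'validation': 'train.hex'},
    'gbm': {'validation': 'score.hex', 'min_rows': 1, 'classification': 1},
    'rf': {'sample_rate': 0.999, 'classification': 1, 'min_rows': 1,
           'nbins': 1024, 'mtries': 3, 'validation': 'train.hex'},
}
kwargsDemo = dict(base)
kwargsDemo.update(overrides['rf'])  # pick the algorithm under test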
def test_exec_enums_rand_cut2(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1,MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0,1)==0: cutExprList.append('p$C'+str(i+1)+'!='+c) else: cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount+1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J'+src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*'+src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", 
h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C'+str(iColCount+1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print(hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount+1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
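# Sidebar: the xList/eList/fList bookkeeping above is a recurring timing
# pattern in these tests. A tiny harness sketch that does the same job for any
# callable (not part of the original code):
import time

def timed_sketch(fn, *args, **kwargs):
    start = time.time()
    result = fn(*args, **kwargs)
    return result, time.time() - start

# usage inside the trial loop:
#   _, execTime = timed_sketch(h2o.nodes[0].exec_query, str=someExpr)
#   _, quantileTime = timed_sketch(h2o.nodes[0].quantiles, source_key=fKey, column=0)
#   xList.append(trial); eList.append(execTime); fList.append(quantileTime)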
def test_exec_enums_rand_cut(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 3, 2, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] for j in range(CUT_EXPR_CNT): print "Creating", CUT_EXPR_CNT, 'cut expressions' # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column # cel = colEnumList[c] cel = colEnumList # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' && '.join(cutExprList) print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] rowExpr = '%s[%s,];' % (hex_key, cutExpr) print "rowExpr:", rowExpr rowExprList.append(rowExpr) print "j:", j # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # print h2o.dump_json(inspect) rSummary = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(rSummary) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. 
randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==0: start = time.time() e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1)) elapsed = time.time() - start print "exec 1 took", elapsed, "seconds." execTime = elapsed if 1==1: start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." if 1==0: gKey = random.choice(eKeys) # do a 2nd random to see if things blow up start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey)) elapsed = time.time() - start print "exec 3 took", elapsed, "seconds." if 1==1: inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
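# Sidebar: the cut expressions built above are strings like p$C1==3 && p$C2==7,
# assembled from randomly chosen columns and the integer encodings of enum
# levels. A distilled builder sketch (same shape as the test's inline loop,
# standalone for readability):
import random

def build_cut_expr_sketch(iColCount, nLevels, frameName='p'):
    cols = random.sample(range(iColCount), random.randint(1, iColCount))
    terms = []
    for c in cols:
        # the tests compare against the integer encoding of an enum level
        level = random.choice(range(nLevels))
        terms.append('%s$C%s==%s' % (frameName, c + 1, level))
    return ' && '.join(terms)

print build_cut_expr_sketch(3, 10)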
def test_exec_enums_rand_cut2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1, MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1 == 1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0, 1) == 0: cutExprList.append('p$C' + str(i + 1) + '!=' + c) else: cutExprList.append('p$C' + str(i + 1) + '==' + c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J' + src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*' + src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) 
levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception( "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1 == 1: a = 'a=c(1,2,3);' + ';'.join( ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so it lands at col 0 in fKey
column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) elapsed = time.time() - start # stop the clock before printing, so quantileTime measures only the quantiles call
h2p.red_print("quantile", quantile, q['result']) print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # the baseline walks the first output col of the full dataset, referenced by name this time
start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C' + str(iColCount + 1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print( hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
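For reference, the cut-expression construction in test_exec_enums_rand_cut2 boils down to the sketch below. make_cut_expr is a hypothetical standalone rendering of the loop above, reusing the test's names (hex_key, iColCount, colEnumList):

import random

def make_cut_expr(hex_key, iColCount, colEnumList):
    # pick a random subset of input columns to constrain
    cols = random.sample(range(iColCount), random.randint(1, iColCount))
    terms = []
    for c in cols:
        cel = colEnumList[c]
        # enum levels are referenced by their integer encoding
        celChoice = str(random.choice(range(len(cel))))
        op = random.choice(['==', '!='])
        terms.append('%s$C%s%s%s' % (hex_key, c + 1, op, celChoice))
    # slice the first output column with the random row predicate
    return '%s[%s,%s];' % (hex_key, ' & '.join(terms), iColCount + 1)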
def test_parse_multi_header_rand(self): ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerChoices = ['aA','aB','aC','aD','aE','aF','aG','aH','aI', 'tomas'] # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ # FIX! one fails count for now # (1, 5, 9, 'cA', 60, 0), (1, 5, 9, 'cA', 60, 0), (1, 5, 25, 'cA', 60, 0), # try with col mismatch on header. # FIX! causes exception? don't test for now # (7, 300, 10, 'cA', 60, 0), # (7, 300, 10, 'cB', 60, 1), # (7, 300, 10, 'cC', 60, 2), # (7, 300, 10, 'cD', 60, 3), # (7, 300, 8, 'cA', 60, 0), # (7, 300, 8, 'cB', 60, 1), # (7, 300, 8, 'cC', 60, 2), # (7, 300, 8, 'cD', 60, 3), ] # so many random combos..rather than walk tryList, just do random for some amount of time for trial in range(50): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 HEADER_HAS_HDR_ROW = random.randint(0,1) DATA_HAS_HDR_ROW = random.randint(0,1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1) ## DATA_FIRST_IS_COMMENT = random.randint(0,1) ## HEADER_FIRST_IS_COMMENT = random.randint(0,1) print "TEMPORARY: don't put any comments in" DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 # none is not legal SEP_CHAR_GEN = random.choice(paramsDict['separator']) print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', SEP_CHAR_GEN # they need to both use the same separator (h2o rule) hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] print hh headerForHeader = SEP_CHAR_GEN.join(hh) # make these different hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"] headerForData = SEP_CHAR_GEN.join(hh) # random selection of parse param choices kwargs = {} for k,v in paramsDict.items(): aChoice = random.choice(v) # can tell h2o something different compared to what we actually used! if k == 'separator': if aChoice: sepChar = aChoice sepCharInt = ord(aChoice) # make it an integer for h2o else: sepChar = ',' # default char for None, need it for header/data file creation sepCharInt = None aChoice = sepCharInt kwargs[k] = aChoice # FOR NOW: ..override the rand choice if it exists, so we can parse and expect 'A' to be found # match what was gen'ed if choice is not None if kwargs['separator']: if SEP_CHAR_GEN==" " or SEP_CHAR_GEN==",": # parse doesn't auto-detect tab. 
# will autodetect space and comma
del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # create data files for fileN in range(fileNum): csvFilename = 'syn_data_' + str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # create the header file hdrFilename = 'syn_header_' + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv' hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) if PARSE_PATTERN_INCLUDES_HEADER: # only include header file data rows if the parse pattern includes it totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON'T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files). # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
xs = h2o.nodes[0].import_files(SYNDATASETS_DIR)['keys'] headerKey = [x for x in xs if hdrFilename in x][0] dataKey = [x for x in xs if csvFilename not in x][0] # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'syn_header': kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'syn_data': kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None print "If header_from_file= is used, we are currently required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = '*syn_*'+str(trial)+"_"+rowxcol+'*' else: pattern = '*syn_data_*'+str(trial)+"_"+rowxcol+'*' parseResult = h2o.nodes[0].parse(pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult['destination_key'] print 'parse time:', parseResult['response']['time'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # more reporting: (we can error here if extra col in header, causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ??
self.assertEqual(inspect['num_cols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], totalCols)) # do we end up parsing one data row as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData ### print (headerRowsDone!=0), (kwargs['header']==1), DATA_HAS_HDR_ROW, (kwargs['header_from_file'] is not None) if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 self.assertEqual(inspect['num_rows'], totalDataRows, "parse created result with the wrong number of rows (header rows don't count) h2o: %s gen'ed: %s" % \ (inspect['num_rows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly # doesn't matter if the header got a comment, should see it h2oShouldSeeHeader = (HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)) or DATA_HAS_HDR_ROW if h2oShouldSeeHeader: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1, 'ignore': 'A'} else: kwargs = {'sample': 75, 'depth': 25, 'ntree': 1} start = time.time() # NOTE: the model build that consumed these RF kwargs appears to have been removed here, so this elapsed measures essentially nothing
elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
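The row-accounting rules above can be restated as a pure function, which makes the two corrections easier to check in isolation. A sketch with a hypothetical name, taking the test's variables as parameters:

def expected_row_delta(headerRowsDone, headerParam, dataHasHdrRow, headerFromFile):
    # h2o eats one data row as a header when told header=1 but none was generated
    losesOne = (headerRowsDone == 0) and (headerParam == 1) and not dataHasHdrRow
    # the data file's header row counts as data when a separate header file is used
    gainsOne = (headerRowsDone != 0) and (headerParam == 1) and \
        dataHasHdrRow and (headerFromFile is not None)
    return (1 if gainsOne else 0) - (1 if losesOne else 0)

# totalDataRows += expected_row_delta(headerRowsDone, kwargs['header'],
#     DATA_HAS_HDR_ROW, kwargs['header_from_file'])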
def test_GLM2_ints_unbalanced(self): h2o.beta_features = True ### h2b.browseTheCloud() SYNDATASETS_DIR = h2o.make_syn_dir() n = 2000 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 4, 'cF', 300), (n, 8, 'cG', 300), (n, 16, 'cH', 300), (n, 32, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list() # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList,5) print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult['destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'xyz' kwargs = { 'n_folds': 0, 'destination_key': modelKey, 'response': y, 'max_iter': 200, 'family': 'binomial', 'alpha': 0, 'lambda': 0, } start = time.time() updateList= [ {'alpha': 0.5, 'lambda': 1e-5}, # {'alpha': 0.25, 'lambda': 1e-4}, ] # Try each one for updateDict in updateList: print "\n#################################################################" print updateDict kwargs.update(updateDict) glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex", timeoutSecs=30, separator=colSepInt) h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, vactual='C' + str(y+1), vpredict=1, expectedAuc=0.6)
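The hex-string separator plumbing used by these tests is easy to sanity-check in isolation (Python 2 str.decode('hex')):

colSepHexString = '2c'                        # comma
colSepChar = colSepHexString.decode('hex')    # ','
colSepInt = int(colSepHexString, base=16)     # 44
assert colSepChar == ',' and colSepInt == ord(colSepChar)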
def test_parse_utf8_3(self): SYNDATASETS_DIR = h2o.make_syn_dir() if DEBUG: n = 20 else: n = 10000 # the unconditional overrides below win, so the test effectively always runs with n = 500
n = 1000 n = 500 # from command line arg -long if h2o.long_test_case: repeat = 1000 else: repeat = 50 scale = 1 tryList = [ (n, 3, 'cI', 300), (n, 3, 'cI', 300), (n, 3, 'cI', 300), ] for r in range(repeat): for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname # same enum list/mapping, but different dataset? start = time.time() write_syn_dataset(csvPathname, rowCount, colCount, scale=1, colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEEDPERFILE) elapsed = time.time() - start print "took %s seconds to create %s" % (elapsed, csvPathname) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0, timeoutSecs=60, separator=colSepInt, doSummary=DO_SUMMARY) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) numCols = inspect['numCols'] numRows = inspect['numRows'] # Each column should get .10 random NAs per iteration. Within 10%? missingValuesList = h2o_cmd.infoFromInspect(inspect) # print "missingValuesList", missingValuesList # for mv in missingValuesList: # self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, # msg='mv %s is not approx. expected %s' % (mv, expectedNA)) # might have extra rows if numRows!=rowCount: raise Exception("Expect numRows %s = rowCount %s because guaranteed not to have extra eols" % \ (numRows, rowCount)) # numCols should be right? self.assertEqual(colCount, numCols) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
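The NA check left commented out above might look like this if re-enabled. expectedNA is assumed here, not derived; the comment in the test suggests each column gets roughly 10% random NAs, but write_syn_dataset's actual rate would need to be confirmed:

# assumed: 10% of rows per column are NA, per the comment in the test
expectedNA = 0.10 * rowCount
for mv in missingValuesList:
    assert abs(mv - expectedNA) <= 0.1 * expectedNA, \
        'mv %s is not within 10 pct. of expected %s' % (mv, expectedNA)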
def test_benchmark_import(self): # typical size of the michal files avgMichalSizeUncompressed = 237270000 avgMichalSize = 116561140 avgSynSize = 4020000 covtype200xSize = 15033863400 synSize = 183 if 1==0: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800), # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600), ] if 1==1: importFolderPath = '/home/0xdiag/datasets/more1_1200_link' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), # for now, take too long on 2x100GB heap on 164 # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600), ]
if 1==0: importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz' print "Using .gz'ed files in", importFolderPath csvFilenameAll = [ # this should hit the "more" files too? ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600), ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600), ] if 1==0: importFolderPath = '/home2/0xdiag/datasets' print "Using non-.gz'ed files in", importFolderPath csvFilenameAll = [ # I use different files to avoid OS caching effects ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700), # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200), # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700), # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700), ] if 1==0: importFolderPath = '/home/0xdiag/datasets/standard' print "Using .gz'ed files in", importFolderPath # all exactly the same prior to gzip! # could use this, but remember import folder -> import folder s3 for jenkins? # how would it get it right? # os.path.getsize(f) csvFilenameAll = [ # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700), # 100 files takes too long on two machines? # ("covtype200x.data", "covtype200x.data", 15033863400, 700), # I use different files to avoid OS caching effects # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700), # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700), # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700), # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700), ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200), ("covtype200x.data", "covtype200x.data", covtype200xSize, 700), # do it twice # ("covtype.data", "covtype.data"), # ("covtype20x.data", "covtype20x.data"), # "covtype200x.data", # "100million_rows.csv", # "200million_rows.csv", # "a5m.csv", # "a10m.csv", # "a100m.csv", # "a200m.csv", # "a400m.csv", # "a600m.csv", # "billion_rows.csv.gz", # "new-poker-hand.full.311M.txt.gz", ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # split out the pattern match and the filename used for the hex
trialMax = 1 # rebuild the cloud for each file base_port = 54321 tryHeap = 28 # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?) DO_GLM = False noPoll = False # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack'] # benchmarkLogging = None benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack'] benchmarkLogging = ['cpu','disk', 'network', 'iostats'] # IOStatus can hang? benchmarkLogging = ['cpu', 'disk', 'network'] pollTimeoutSecs = 120 retryDelaySecs = 10 jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks' jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" jea = ' -Dcom.sun.management.jmxremote.port=54330' + \ ' -Dcom.sun.management.jmxremote.authenticate=false' + \ ' -Dcom.sun.management.jmxremote.ssl=false' + \ ' -Dcom.sun.management.jmxremote' + \ ' -Dcom.sun.management.jmxremote.local.only=false' jea = ' -Dlog.printAll=true' for i,(csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): localhost = h2o.decide_if_localhost() if (localhost): h2o.build_cloud(2,java_heap_GB=tryHeap, base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) else: h2o_hosts.build_cloud_with_hosts(base_port=base_port, # java_extra_args=jea, enable_benchmark_log=True) # pop open a browser on the cloud ### h2b.browseTheCloud() # to avoid sticky ports? ### base_port += 2 for trial in range(trialMax): importFolderResult = h2i.setupImportFolder(None, importFolderPath) importFullList = importFolderResult['files'] importFailList = importFolderResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # creates csvFilename.hex from file in importFolder dir h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) print csvFilepattern, 'parse time:', parseKey['response']['time'] print "Parse result['destination_key']:", parseKey['destination_key'] # BUG here? if not noPoll: # We should be able to see the parse result? h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False) # the nflx data doesn't have a small enough # of classes in any col # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone? origKey = parseKey['destination_key'] # execExpr = 'a = randomFilter('+origKey+',200,12345678)' execExpr = 'a = slice('+origKey+',1,200)' h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30) # runRFOnly takes the parseKey directly newParseKey = {'destination_key': 'a'} print "\n" + csvFilepattern # poker and the water.UDP.set3(UDP.java) fail issue.. # constrain depth to 25 print "Temporarily hacking to do nothing instead of RF on the parsed file" ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs) ### h2b.browseJsonHistoryAsUrlLastMatch("RFView") #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]: x.remove(i) x = ",".join(map(str,x)) GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs) h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** h2o_cmd.checkKeyDistribution() h2o_cmd.deleteCsvKey(csvFilename, importFolderResult) ### time.sleep(3600) h2o.tear_down_cloud() if not localhost: print "Waiting 30 secs before building cloud again (sticky ports?)" ### time.sleep(30) sys.stdout.write('.') sys.stdout.flush()
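The MB/sec figure logged above is just bytes over wall-clock. A worked instance, reusing this test's avgMichalSize constant with an assumed elapsed time:

avgMichalSize = 116561140                    # bytes per gz'ed michal file
totalBytes = 100 * avgMichalSize             # the file_100 case, ~11.7 GB
elapsed = 95.0                               # assumed parse time in seconds
fileMBS = (totalBytes / 1e6) / elapsed
print '%6.2f MB/sec' % fileMBS               # ~122.70 MB/sec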
def test_GLM2_enums_score_superset(self): h2o.beta_features = True print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?" SYNDATASETS_DIR = h2o.make_syn_dir() n = 200 tryList = [ (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: # using the comma is nice to ensure no craziness colSepHexString = '2c' # comma colSepChar = colSepHexString.decode('hex') colSepInt = int(colSepHexString, base=16) print "colSepChar:", colSepChar rowSepHexString = '0a' # newline rowSepChar = rowSepHexString.decode('hex') print "rowSepChar:", rowSepChar SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename enumList = create_enum_list(listSize=10) # use half of the enums for creating the scoring dataset enumListForScore = random.sample(enumList, 5) # add a extra enum for scoring that's not in the model enumList enumListForScore.append("xyzzy") print "Creating random", csvPathname, "for glm model building" write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)" write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, separator=colSepInt) print "Parse result['destination_key']:", parseResult[ 'destination_key'] print "\n" + csvFilename (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True) y = colCount modelKey = 'enums' kwargs = { 'destination_key': modelKey, 'response': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 'family': 'binomial' } start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs) print "glm end on ", parseResult[ 'destination_key'], 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) scoreDataKey = "score_" + hex_key parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt) # Score ******************************* # this messes up if you use case_mode/case_vale above predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=scoreDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # just get a predict and AUC on the same data. has to be binomial result resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual=scoreDataKey, predict='Predict.hex', vactual=y, vpredict=1) auc = resultAUC['AUC'] self.assertAlmostEqual( auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=scoreDataKey, predict=predictKey, vactual='C' + str(y + 1), vpredict='predict', ) cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm)
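h2o_gbm.pp_cm_summary presumably reduces the confusion matrix to a percent-wrong figure; a minimal sketch of that reduction, with a hypothetical helper name, assuming cm is a plain list of count rows without header decoration:

def pct_wrong(cm):
    # off-diagonal counts as a percentage of all predictions
    total = sum(sum(row) for row in cm)
    right = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - right) / total

print pct_wrong([[40, 10], [15, 35]])   # 25.0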