def test_plot_remove_keys_manyfiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() print "Remember, the parse only deletes what got parsed. We import the folder. So we double import. That should work now" tryList = [ ("file_1[0-9].dat.gz", 'c10', 600), ("file_[1-2][0-9].dat.gz", 'c20', 600), ("file_[1-4][0-9].dat.gz", 'c40', 600), ("file_[1-8][0-9].dat.gz", 'c80', 600), # don't do this case. timesout at 300 sec on polling with 172-180 # ("file_[1-2][1-8][0-9].dat.gz", 'c160', 1200), ] xList = [] eList = [] fList = [] importFolderPath = "manyfiles-nflx-gz" for (csvFilePattern, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, retryDelaySecs=3, timeoutSecs=timeoutSecs, doSummary=False) parseElapsed = time.time() - start print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds" h2o.check_sandbox_for_errors() # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) parsedBytes = inspect['byteSize'] node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" 
start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(parsedBytes) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1==1: xLabel = 'parsedBytes' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rapids_vec_fail1(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6),int(100e6),int(10e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' execExpr = '(= !v (+ %v %v))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'vector length' eLabel = 'elapsed (create v)' fLabel = 'elapsed (v = v + v)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 100, 'cG', 400), (200000, 100, 'cH', 400), (400000, 100, 'cI', 400), (800000, 100, 'cJ', 400), (1000000, 100, 'cK', 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES-1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) iA = h2o_cmd.InspectObj(pA.parse_key) parseElapsed = pA.python_elapsed parse_key = pA.parse_key byteSize = pA.byteSize numRows = iA.numRows numCols = iA.numCols print parse_key, parseElapsed, byteSize, numRows, numCols labelList = iA.labelList node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(byteSize) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1==1: xLabel = 'byteSize' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_speedrf_covtype_fvec(self): importFolderPath = "standard" # Parse Train ****************************************************** # csvTrainFilename = 'covtype.data' csvTrainFilename = 'covtype20x.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) xList = [] eList = [] fList = [] trial = 0 for trial in range(10): timeoutSecs = 30 # have unique model names start = time.time() summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=timeoutSecs) elapsed = time.time() - start print 'summary end', trial, 'on', csvTrainPathname, 'took', elapsed, 'seconds' fList.append(elapsed) eList.append(elapsed) if DO_PLOT: xLabel = 'trial' xList.append(trial) if DO_PLOT: eLabel = 'elapsed' fLabel = 'elapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_RF_many_cols_enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'RFModelKey' # Parse (train)**************************************** start = time.time() parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # RF(train iterate)**************************************** ntrees = 10 for max_depth in [5,10,20,40]: params = { 'nbins': 1024, 'classification': 1, 'ntrees': ntrees, 'max_depth': max_depth, 'response': 'C' + str(numCols-1), 'ignored_cols_by_name': None, } print "Using these parameters for RF: ", params kwargs = params.copy() trainStart = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "RF training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) rfResult["drf_model"] = rfResult.pop("speedrf_model") errsLast = rfResult['drf_model']['errs'][-1] print "RF 'errsLast'", errsLast cm = rfResult['drf_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1==1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rapids_vec_fail(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6), int(8e6), int(1e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !vreal (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # change it to all 1s? v = v==0 execExpr = '(= !vint (N %vreal #0))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) # comparing the sum times for int vs real..maybe the other guy isn't real. at least: different compression # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' # recursively expand execExpr = '(= !v2 (+ %vint <patt>))' for j in range(3): execExpr = re.sub('<patt>', '(+ %vint <patt>)', execExpr) # last one execExpr = re.sub('<patt>', '(+ %vint %vint)', execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed2 = time.time() - start execExpr = '(= !v1 (+ %vreal %vreal))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start inspectResult = h2o_cmd.runInspect(key='vreal') h2o_cmd.infoFromInspect(inspectResult) inspectResult = h2o_cmd.runInspect(key='vint') h2o_cmd.infoFromInspect(inspectResult) summaryResult = h2o_cmd.runSummary(key='vreal') if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1 == 1: xLabel = 'vector length' eLabel = 'elapsed (v1 = vint + vint)' fLabel = 
'elapsed (v2 = vreal + vreal)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_exec_enums_rand_cut2(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1,MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... 
, ] # randomly pick == or != if random.randint(0,1)==0: cutExprList.append('p$C'+str(i+1)+'!='+c) else: cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount+1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J'+src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*'+src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = 
h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? 
column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C'+str(iColCount+1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print(hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount+1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_RF_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u' ] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), # (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'RFModelKey' # Parse (train)**************************************** start = time.time() parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # RF(train iterate)**************************************** ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'nbins': 1024, 'classification': 1, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(numCols - 1), 'ignored_cols_by_name': None, } print "Using these parameters for RF: ", params kwargs = params.copy() trainStart = time.time() rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "RF training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str( max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) errsLast = rfResult['drf_model']['errs'][-1] print "RF 'errsLast'", errsLast cm = rfResult['drf_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # just plot the last one if 1 == 1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_covtype_fvec(self): h2o.beta_features = True # fvec importFolderPath = "standard" # Parse Train ****************************************************** csvTrainFilename = 'covtype.shuffled.90pct.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) # Parse Test ****************************************************** csvTestFilename = 'covtype.shuffled.10pct.data' csvTestPathname = importFolderPath + "/" + csvTestFilename hex_key = csvTestFilename + ".hex" parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key']) rfViewInitial = [] xList = [] eList = [] fList = [] trial = 0 depthList = [10, 20, 30, 40] ntreesList = [5, 10, 20, 30] # ntreesList = [2] nbinsList = [10, 100, 1000] if TRY == 'max_depth': tryList = depthList elif TRY == 'ntrees': tryList = ntreesList elif TRY == 'nbins': tryList = nbinsList else: raise Exception("huh? %s" % TRY) for d in tryList: if TRY == 'max_depth': paramDict['max_depth'] = d elif TRY == 'ntrees': paramDict['ntrees'] = d elif TRY == 'nbins': paramDict['nbins'] = d else: raise Exception("huh? %s" % TRY) # adjust timeoutSecs with the number of trees # seems ec2 can be really slow if DO_OOBE: paramDict['validation'] = None else: paramDict['validation'] = parseTestResult['destination_key'] timeoutSecs = 30 + paramDict['ntrees'] * 200 # do ten starts, to see the bad id problem? 
TRIES = 5 for i in range(TRIES): lastOne = i == (TRIES - 1) # have unique model names trial += 1 kwargs = paramDict.copy() model_key = 'RFModel_' + str(trial) kwargs['destination_key'] = model_key data_key = parseTrainResult['destination_key'] start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs) trainElapsed = time.time() - start print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds' # don't cancel the last one if not lastOne: time.sleep(1) h2o_jobs.cancelAllJobs(timeoutSecs=2) ### print "rfView", h2o.dump_json(rfView) print "We have a result from the RF above, completed but didn't do RFView yet" # could the RF indicate 'done' too soon? # if rfResult['state']=='RUNNING': # raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult)) # if 'drf_model' not in rfResult: # raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult)) h2o_jobs.pollWaitJobs(timeoutSecs=300) rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, retryDelaySecs=5, doSimpleCheck=False) print "rfView:", h2o.dump_json(rfView) rf_model = rfView['drf_model'] cms = rf_model['cms'] ### print "cm:", h2o.dump_json(cm) ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] varimp = rf_model['varimp'] treeStats = rf_model['treeStats'] print "maxDepth:", treeStats['maxDepth'] print "maxLeaves:", treeStats['maxLeaves'] print "minDepth:", treeStats['minDepth'] print "minLeaves:", treeStats['minLeaves'] print "meanLeaves:", treeStats['meanLeaves'] print "meanDepth:", treeStats['meanDepth'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) # we iterate over params, so can't really do this check # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too 
much" % classification_error) print "classErrorPctList:", classErrorPctList self.assertEqual( len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict" ) # FIX! should update this expected classification error predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key) eList.append(classErrorPctList[4]) fList.append(trainElapsed) if DO_PLOT: if TRY == 'max_depth': xLabel = 'max_depth' elif TRY == 'ntrees': xLabel = 'ntrees' elif TRY == 'nbins': xLabel = 'nbins' else: raise Exception("huh? %s" % TRY) xList.append(paramDict[xLabel]) if DO_PLOT: eLabel = 'class 4 pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec_enums_rand_cut2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1, MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1 == 1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... 
, ] # randomly pick == or != if random.randint(0, 1) == 0: cutExprList.append('p$C' + str(i + 1) + '!=' + c) else: cutExprList.append('p$C' + str(i + 1) + '==' + c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J' + src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*' + src_key, hex_key=hex_key, timeoutSecs=800) print "Parse 
result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception( "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1 == 1: a = 'a=c(1,2,3);' + ';'.join( ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. 
Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C' + str(iColCount + 1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print( hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_exec_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 3, 2, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] for j in range(CUT_EXPR_CNT): print "Creating", CUT_EXPR_CNT, 'cut expressions' # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column # cel = colEnumList[c] cel = colEnumList # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... 
, ] cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' && '.join(cutExprList) print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] rowExpr = '%s[%s,];' % (hex_key, cutExpr) print "rowExpr:", rowExpr rowExprList.append(rowExpr) print "j:", j # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # print h2o.dump_json(inspect) rSummary = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(rSummary) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? 
if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==0: start = time.time() e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1)) elapsed = time.time() - start print "exec 1 took", elapsed, "seconds." execTime = elapsed if 1==1: start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." if 1==0: gKey = random.choice(eKeys) # do a 2nd random to see if things blow up start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey)) elapsed = time.time() - start print "exec 3 took", elapsed, "seconds." if 1==1: inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' 
quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_covtype_train_test(self): h2o.beta_features = False bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" 
parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key']) ntrees = 2 # fails with 40 for max_depth in [40, 5]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # translate it (only really need to do once . out of loop? h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_params_rand2(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex') ] for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # GBM (train iterate)**************************************** inspect = h2o_cmd.runInspect( key=parseTestResult['destination_key']) paramsDict = define_gbm_params() for trial in range(3): # translate it (only really need to do once . out of loop? 
h2o_cmd.runInspect(key=parseTrainResult['destination_key']) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # use this to set any defaults you want if the pick doesn't set params = { 'response': 54, 'ignored_cols_by_name': 'C1,C2,C3,C4,C5', 'ntrees': 2, 'validation': parseTestResult['destination_key'], } h2o_gbm.pickRandGbmParams(paramsDict, params) print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1][ '_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename if DO_PREDICT_CM: gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='predict', predict=predictKey, vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? 
# all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) if 'max_depth' in params and params['max_depth']: xList.append(params['max_depth']) eList.append(pctWrongTrain) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrongTrain' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ (n, 10, 9, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1, iColCount)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like # celChoice = str(random.choice(range(len(cel)))) celChoice = random.choice(range(len(cel))) cutValue[c] = celChoice cutExprList = [] pKey = Key('p') for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... 
, ] # cutExprList.append('p$C'+str(i+1)+'=='+c) # all column indexing in h2o-dev is with number e = Fcn('==', c, pKey[:, i]) cutExprList.append(e) cutExpr = None for ce in cutExprList: if cutExpr: cutExpr = Fcn('&', cutExpr, ce) else: cutExpr = ce print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] # rowExpr = '%s[%s,];' % (hex_key, cutExpr) hKey = Key(hex_key) rowExpr = hKey[cutExpr, :] print "rowExpr:", rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) # print h2o.dump_json(inspect) # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ # h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False) # error if any col has constant values # if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # build up the columns Assign('b', [1, 2, 3]) # could also append 1 col at a time, by assigning to the next col number? 
Assign('a', Cbind(['b' for i in range(colCount)])) for eKey in eKeys: Assign(eKey, 'a') ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] if 1 == 1: start = time.time() Assign(fKey, random.choice(rowExprList)).do() elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # FIX! put quantile back in? quantileTime = 0 # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000000, 5, 'cD', 0, 10, 30), (1000000, 5, 'cD', 0, 20, 30), (1000000, 5, 'cD', 0, 30, 30), (1000000, 5, 'cD', 0, 40, 30), (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, 'cD', 0, 70, 30), (1000000, 5, 'cD', 0, 100, 30), (1000000, 5, 'cD', 0, 130, 30), (1000000, 5, 'cD', 0, 160, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) # PARSE train**************************************** hexKey = 'r.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60) # do it twice..to get the optimal cached delay for time? 
execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) groups = execResult['num_rows'] maxExpectedGroups = ((maxInt - minInt) + 1)**2 h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) # should be same answer in both cases execExpr = "d=sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) print "execResult", h2o.dump_json(execResult) self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = 'groups' eLabel = 'ddplyElapsed' fLabel = 'ddplyElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex' ) ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect( key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5, 10, 20, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" 
gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if h2o.localhost: tryList = [ (10000, 100, 'cA', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, 'cD', 300), (10000, 200, 'cE', 300), (10000, 300, 'cF', 300), (10000, 400, 'cG', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' # Parse (train)**************************************** parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) # hack elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 5 prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] # for max_depth in [5,10,20,40]: for max_depth in [5, 10, 20]: # PARSE a new header**************************************** print "Creating new header", hdrPathname prefix = prefixList.pop(0) write_syn_header(hdrPathname, rowCount, colCount, prefix) # upload and parse the header to a hex hdr_hex_key = prefix + "_hdr.hex" parseHdrResult = h2i.import_parse(bucket=None, path=hdrPathname, schema='put', header=1, # REQUIRED! otherwise will interpret as enums hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False) # Set Column Names (before autoframe is created) h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key) # GBM print "response col name is changing each iteration: parsing a new header" params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': prefix + "_response", 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # works if you delete the autoframe ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe') # just plot the last one if DO_PLOT: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_PCA_many_cols_enum_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10, 5, 'cD', 300), (10, 10, 'cD', 300), (10, 20, 'cD', 300), (10, 40, 'cD', 300), (10, 80, 'cD', 300), (10, 160, 'cD', 300), (10, 200, 'cD', 300), ] xList = [] eList = [] fList = [] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** start = time.time() modelKey = 'PCAModelKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # PCA(tolerance iterate)**************************************** for tolerance in [0.1]: # for tolerance in [i/10.0 for i in range(1)]: params = { 'ignored_cols': 'C1', 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey = modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? 
sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print # xList.append(ntrees) xList.append(numCols) eList.append(tolerance) fList.append(elapsed) # just plot the last one if 1==1: xLabel = 'numCols' eLabel = 'tolerance' fLabel = 'elapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rapids_cbind_vec(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): # for trial in range(int(1e6),int(200e6),int(1e6)): for trial in [int(100e6)]: # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' # cols = 100 xList = [] eList = [] fList = [] for trial2 in range(0, 16): # for trial2 in range(0, 10): # fails. Post size? # for trial2 in range(0, 16): col = 2 ** trial2 # assert col < 16384, "h2o can't take col == 16384 or more" vString = ' '.join(['%v' for x in range(col)]) execExpr = '(= !v2 (cbind %s))' % vString # FIX! check the colnames. 2 cols get C1 and C10? 
odd # try: start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # except: # elapsed2 = 0 # h2p.red_print("ERROR: col = %s failed" % col) if 1==0: start = time.time() execExpr = '(sum %v2 %TRUE)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) elapsed1 = time.time() - start # xList.append(length) xList.append(col) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'col' eLabel = 'elapsed (sum)' fLabel = 'elapsed (cbind cols)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser() == 'kevin': tryList = [ (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not oColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range(0, numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i) # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) # print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col levels = h2o.nodes[0].levels(source=hex_key) l = levels['levels'] for column in range(iColCount, iColCount + oColCount): if l[column]: print "Skipping", 
column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() if 1 == 1: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: tryList = [ (100000, 400, 'cA', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (100000, 100, 'cD', 300), (100000, 200, 'cE', 300), (100000, 500, 'cG', 300), (100000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** h2o.beta_features = False #turn off beta_features start = time.time() xList = [] eList = [] fList = [] h2o.beta_features = False modelKey = 'GBMModelKey' # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" # l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( # len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] h2o_cmd.infoFromInspect(inspect, csvPathname) ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True # was failing with 100 trees # ntrees = 100 # for max_depth in [5,10,20,40]: ntrees = 10 for max_depth in [5]: params = { 'learn_rate': .2, 'nbins': 10, # 1024 fail 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': num_cols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rapids_vec_fail(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6),int(8e6),int(1e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !vreal (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # change it to all 1s? v = v==0 execExpr = '(= !vint (N %vreal #0))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) # comparing the sum times for int vs real..maybe the other guy isn't real. at least: different compression # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' # recursively expand execExpr = '(= !v2 (+ %vint <patt>))' for j in range(3): execExpr = re.sub('<patt>', '(+ %vint <patt>)', execExpr) # last one execExpr = re.sub('<patt>', '(+ %vint %vint)', execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed2 = time.time() - start execExpr = '(= !v1 (+ %vreal %vreal))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start inspectResult = h2o_cmd.runInspect(key='vreal') h2o_cmd.infoFromInspect(inspectResult) inspectResult = h2o_cmd.runInspect(key='vint') h2o_cmd.infoFromInspect(inspectResult) summaryResult = h2o_cmd.runSummary(key='vreal') if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'vector length' eLabel = 'elapsed (v1 = vint + vint)' fLabel = 'elapsed 
(v2 = vreal + vreal)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_GBM_poker_1m(self): h2o.beta_features = True for trial in range(2): # PARSE train**************************************** h2o.beta_features = False #turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' timeoutSecs = 900 # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = 'poker/poker-hand-testing.data' hex_key = 'poker-hand-testing.data.hex' parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 2 for max_depth in [5,10,20]: params = { 'learn_rate': .1, 'nbins': 10, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': numCols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
vpredict='predict', # choices are 7 (now) and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(trial) eList.append(pctWrong) fList.append(trainElapsed) <<<<<<< HEAD xLabel = 'trial' ======= xLabel = 'trial. importance=0,1,default,...' >>>>>>> 50f5b1b8c94b6ce7cd5ec175fecdca811f41487f eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) if __name__ == '__main__': h2o.unit_main()
def test_GBM_many_cols_enum(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] tryList = [ (10000, 100, 'cA', 300), (10000, 300, 'cB', 500), # (10000, 500, 'cC', 700), # (10000, 700, 'cD', 3600), # (10000, 900, 'cE', 3600), # (10000, 1000, 'cF', 3600), # (10000, 1300, 'cG', 3600), # (10000, 1700, 'cH', 3600), # (10000, 2000, 'cI', 3600), # (10000, 2500, 'cJ', 3600), # (10000, 3000, 'cK', 3600), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = hex_key elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 10 for max_depth in [5,10,20,40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(numCols-1), 'ignored_cols_by_name': None, } # both response variants should work? # if random.randint(0,1): # params['response'] = numCols-1, print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. 
On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if 1==1: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 50, 'cG', 400, 400), (200000, 50, 'cH', 400, 400), (400000, 50, 'cI', 400, 400), (800000, 50, 'cJ', 400, 400), (1000000, 50, 'cK', 400, 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES-1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) start = time.time() print csvFilename, "parse starting" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) parseElapsed = time.time() - start print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds" h2o.check_sandbox_for_errors() # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount)) parsedBytes = inspect['byteSize'] node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" 
start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(parsedBytes) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1==1: xLabel = 'parsedBytes' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_log_like_R(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' csvPathname = 'airlines/year2013.csv' # csvPathname = '1B/reals_100000x1000_15f.data' # csvPathname = '1B/reals_1000000x1000_15f.data' # csvPathname = '1B/reals_1000000x1_15f.data' # csvPathname = '1B/reals_1B_15f.data' # csvPathname = '1B/reals_100M_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) xList = [] eList = [] fList = [] for execExpr in initList: execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) for trial in range(300): for execExpr in exprList: # put the trial number into the temp for uniqueness execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) execTime = time.time() - start print 'exec took', execTime, 'seconds' c = h2o.nodes[0].get_cloud() c = c['nodes'] # print (h2o.dump_json(c)) k = [i['num_keys'] for i in c] v = [i['value_size_bytes'] for i in c] print "keys: %s" % " ".join(map(str,k)) print "value_size_bytes: %s" % " ".join(map(str,v)) # print "result:", result if DO_ORIG: if 'r1' in execExpr: xList.append(trial) eList.append(execTime) if 'log' in execExpr: fList.append(execTime) else: xList.append(trial) eList.append(execTime) fList.append(execTime) h2o.check_sandbox_for_errors() # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'trial' if DO_ORIG: eLabel = 'time: Last.value<trial>.4 = r1[,c(1)]' fLabel = 'time: Last.value<trial>.7 = log(Last.value<trial>.6)' else: eLabel = 'time: Last.value.3 = r2+1' fLabel = 'time: Last.value.3 = r2+1' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_GBM_poker_1m(self): for trial in range(2): # PARSE train**************************************** h2o.beta_features = False #turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = 'GBMModelKey' timeoutSecs = 900 # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = 'poker/poker-hand-testing.data' hex_key = 'poker-hand-testing.data.hex' parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** h2o.beta_features = True ntrees = 2 for max_depth in [5,10,20]: params = { 'learn_rate': .1, 'nbins': 10, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 
num_cols-1, 'ignored_cols_by_name': None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) h2o.beta_features = False # just plot the last one if DO_PLOT_IF_KEVIN: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! 
execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = numCols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1) ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response+1), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? 
is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response+1), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser()=='kevin': tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not oColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range (0,numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i); # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col levels = h2o.nodes[0].levels(source=hex_key); l = levels['levels'] for column in range(iColCount, iColCount+oColCount): if l[column]: print "Skipping", 
column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() if 1==0: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_exec_enums_rand_cut(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 3, 2, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] for j in range(CUT_EXPR_CNT): print "Creating", CUT_EXPR_CNT, 'cut expressions' # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column # cel = colEnumList[c] cel = colEnumList # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... 
, ] cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' && '.join(cutExprList) print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] rowExpr = '%s[%s,];' % (hex_key, cutExpr) print "rowExpr:", rowExpr rowExprList.append(rowExpr) print "j:", j # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # print h2o.dump_json(inspect) rSummary = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(rSummary) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? 
if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==0: start = time.time() e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1)) elapsed = time.time() - start print "exec 1 took", elapsed, "seconds." execTime = elapsed if 1==1: start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." if 1==0: gKey = random.choice(eKeys) # do a 2nd random to see if things blow up start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey)) elapsed = time.time() - start print "exec 3 took", elapsed, "seconds." if 1==1: inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' 
quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_params_rand2(self):
    """Train GBM three times on covtype with picked parameters and predict.

    Parses a train and a test file with beta_features off, then for three
    trials switches beta_features on, lets h2o_gbm.pickRandGbmParams fill in
    the parameter dict, trains (noPoll + explicit job wait), prints the last
    training confusion matrix, runs a predict on the test frame and, when
    DO_PREDICT_CM is set, prints a prediction confusion matrix. Training
    error and training time are plotted against max_depth.
    """
    h2o.beta_features = False
    bucket = 'home-0xdiag-datasets'
    modelKey = 'GBMModelKey'
    files = [
        # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
        ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
    ]

    for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
        h2o.beta_features = False #turn off beta_features
        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []

        # Parse (train)****************************************
        # beta_features was just set False, so the noPoll branches below are
        # not taken on this path
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTrainResult['destination_key'] for h2o"
            parseTrainResult['destination_key'] = trainKey

        elapsed = time.time() - start
        print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        # Parse (test)****************************************
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
            hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTestResult['destination_key'] for h2o"
            parseTestResult['destination_key'] = testKey

        # NOTE(review): `start` is still the train-parse start time, so this
        # figure also includes the train parse - confirm whether intended
        elapsed = time.time() - start
        print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "test parse result:", parseTestResult['destination_key']

        # GBM (train iterate)****************************************
        # the result of this Inspect is not read again in this test
        inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
        paramsDict = define_gbm_params()
        for trial in range(3):
            h2o.beta_features = True
            # translate it (only really need to do once . out of loop?
            h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

            # use this to set any defaults you want if the pick doesn't set
            params = {
                'response': 54,
                'ignored_cols_by_name': '0,1,2,3,4',
                'ntrees': 2,
                'validation': parseTestResult['destination_key'],
            }
            # presumably updates `params` in place with choices drawn from
            # paramsDict - confirm against h2o_gbm
            h2o_gbm.pickRandGbmParams(paramsDict, params)
            print "Using these parameters for GBM: ", params
            kwargs = params.copy()

            # GBM train****************************************
            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "Last line of this cm might be NAs, not CM"
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # GBM test****************************************
            predictKey = 'Predict.hex'
            h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            start = time.time()
            gbmTestResult = h2o_cmd.runPredict(
                data_key=parseTestResult['destination_key'],
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

            if DO_PREDICT_CM:
                # NOTE(review): vactual names 'predict' in the *actual* frame;
                # the comment below suggests 7 is the other choice - confirm
                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='predict',
                    predict=predictKey,
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cms'][-1] # use the last one

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            # only trials that ended up with a truthy max_depth are plotted
            if 'max_depth' in params and params['max_depth']:
                xList.append(params['max_depth'])
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        xLabel = 'max_depth'
        eLabel = 'pctWrongTrain'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rapids_rbind(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): # for trial in range(int(1e6),int(200e6),int(1e6)): ROWS = int(100e6) for trial in [ROWS]: # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) xList = [] eList = [] fList = [] # gets out of memory error if we rbind too much for trial2 in range(1, 8, 2): # for trial2 in range(0, 10): # fails. Post size? # for trial2 in range(0, 16): rows = ROWS * trial2 vString = ' '.join(['%v' for x in range(trial2)]) execExpr = '(= !v2 (rbind %s))' % vString start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) if 1==1: start = time.time() execExpr = '(sum %v2 %TRUE)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) elapsed1 = time.time() - start # xList.append(length) xList.append(rows) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'rows' eLabel = 'elapsed (sum)' fLabel = 'elapsed (rbind)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_exec2_enums_rand_cut(self):
    """Time random row cuts of a synthetic enum dataset (expression objects).

    CUT_EXPR_CNT row-selection expressions are built from Key/Fcn objects,
    a synthetic dataset is written and parsed as 'p', and 200 times a
    randomly chosen expression is assigned to a random scratch key and the
    result inspected. The quantile step is currently left out, so the
    "quantile time" series is all zeros. Times are plotted against the
    trial number when DO_PLOT is set.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = ROWS
    tryList = [
        (n, 10, 9, 'cE', 300),
    ]

    # create key names to use for exec
    eKeys = ['e%s' % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        # overrides the tryList key: the frame is always parsed as 'p'
        hex_key = 'p'
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        print "Creating", CUT_EXPR_CNT, 'cut expressions'
        for j in range(CUT_EXPR_CNT):
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression
            cols = random.sample(range(iColCount), random.randint(1,iColCount))
            for c in cols:
                # possible choices within the column
                cel = colEnumList[c]
                # for now the cutValues are numbers for the enum mappings
                # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                # celChoice = str(random.choice(range(len(cel))))
                celChoice = random.choice(range(len(cel)))
                cutValue[c] = celChoice

            # one equality comparison per column that got a cut value
            cutExprList = []
            pKey = Key('p')
            for i,c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    # cutExprList.append('p$C'+str(i+1)+'=='+c)
                    # all column indexing in h2o-dev is with number
                    e = Fcn('==', c, pKey[:,i])
                    cutExprList.append(e)

            # fold the comparisons together with '&', left to right
            # NOTE(review): relies on an Fcn object being truthy - confirm
            cutExpr = None
            for ce in cutExprList:
                if cutExpr:
                    cutExpr = Fcn('&', cutExpr, ce)
                else:
                    cutExpr = ce

            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            # (fKey/eKey are not read again inside this loop)
            e = random.sample(eKeys,2)
            fKey = e[0]
            eKey = e[1]

            # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            hKey = Key(hex_key)
            rowExpr = hKey[cutExpr, :]

            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

        # print h2o.dump_json(inspect)
        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

        # error if any col has constant values
        # if len(constantValuesDict) != 0:
        #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!

        # build up the columns
        # NOTE(review): these Assign objects are created without .do(), unlike
        # the timed Assign below - presumably construction alone runs them;
        # confirm against the Assign class
        Assign('b', [1,2,3])
        # could also append 1 col at a time, by assigning to the next col number?
        Assign('a', Cbind(['b' for i in range(colCount)]))

        for eKey in eKeys:
            Assign(eKey, 'a')
            ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            # (the two random columns are not read again in this loop)
            randICol = random.randint(0,iColCount-1)
            randOCol = random.randint(iColCount, iColCount+oColCount-1)

            # should be two different keys in the sample
            e = random.sample(eKeys,2)
            fKey = e[0]
            eKey = e[1]

            if 1==1:
                start = time.time()
                Assign(fKey, random.choice(rowExprList)).do()
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

            # FIX! put quantile back in?
            quantileTime = 0

            # remove all keys*******************************************************
            # what about hex_key?
            if 1==0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = 'trial'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_plot_remove_keys_manyfiles(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() print "Remember, the parse only deletes what got parsed. We import the folder. So we double import. That should work now" tryList = [ ("file_1[0-9].dat.gz", 'c10', 400), ("file_[1-2][0-9].dat.gz", 'c20', 400), ("file_[1-4][0-9].dat.gz", 'c40', 400), ("file_[1-8][0-9].dat.gz", 'c80', 400), # don't do this case. timesout at 300 sec on polling with 172-180 # ("file_[1-2][1-8][0-9].dat.gz", 'c160', 1200), ] xList = [] eList = [] fList = [] importFolderPath = "manyfiles-nflx-gz" for (csvFilePattern, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvPathname = importFolderPath + "/" + csvFilePattern start = time.time() parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, retryDelaySecs=3, timeoutSecs=timeoutSecs, doSummary=False) parseElapsed = time.time() - start print "Parse only:", parseResult[ 'destination_key'], "took", parseElapsed, "seconds" h2o.check_sandbox_for_errors() # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) parsedBytes = inspect['byteSize'] node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" 
start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(parsedBytes) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1 == 1: xLabel = 'parsedBytes' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_many_cols_enum(self):
    """Time GBM training on wide synthetic datasets at several tree depths.

    For each (rowCount, colCount, hex_key, timeoutSecs) entry a synthetic
    csv is written and parsed (noPoll + explicit job wait, since
    beta_features is on), then a 10-tree GBM is trained for max_depth 5, 10,
    20 and 40. Parse and train timings are printed and sent to
    h2o.cloudPerfH2O, the last training confusion matrix is printed, and
    training error and time are plotted against max_depth.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # passed through to write_syn_dataset
    translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

    if getpass.getuser() == 'kevin': # longer run
        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000, 500, 'cC', 700),
            # (10000, 700, 'cD', 3600),
            # (10000, 900, 'cE', 3600),
            # (10000, 1000, 'cF', 3600),
            # (10000, 1300, 'cG', 3600),
            # (10000, 1700, 'cH', 3600),
            # (10000, 2000, 'cI', 3600),
            # (10000, 2500, 'cJ', 3600),
            # (10000, 3000, 'cK', 3600),
        ]
    else:
        tryList = [
            (10000, 100, 'cA', 100),
            (10000, 300, 'cC', 300),
        ]

    ### h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

        # PARSE train****************************************
        start = time.time()
        xList = []
        eList = []
        fList = []

        modelKey = 'GBMModelKey'

        # Parse (train)****************************************
        if h2o.beta_features:
            print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
        parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
            hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
        # hack: the parse was started with noPoll, so wait for the job and
        # fill in the destination key by hand
        if h2o.beta_features:
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            print "Filling in the parseTrainResult['destination_key'] for h2o"
            parseTrainResult['destination_key'] = hex_key

        elapsed = time.time() - start
        print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "train parse result:", parseTrainResult['destination_key']

        # Logging to a benchmark file
        algo = "Parse"
        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
        print l
        h2o.cloudPerfH2O.message(l)

        # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
        # h2o.beta_features = True
        inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

        # GBM(train iterate)****************************************
        h2o.beta_features = True
        ntrees = 10
        for max_depth in [5,10,20,40]:
            params = {
                'learn_rate': .2,
                'nbins': 1024,
                'ntrees': ntrees,
                'max_depth': max_depth,
                'min_rows': 10,
                # NOTE(review): if column names are 1-based ('C1'..'Cn'), this
                # names the second-to-last column, while the commented-out
                # index variant below (numCols-1) would be the last - confirm
                'response': 'C' + str(numCols-1),
                'ignored_cols_by_name': None,
            }
            # both response variants should work?
            # if random.randint(0,1):
            #     params['response'] = numCols-1,

            print "Using these parameters for GBM: ", params
            kwargs = params.copy()
            h2o.beta_features = True

            trainStart = time.time()
            gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            trainElapsed = time.time() - trainStart
            print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

            # Logging to a benchmark file
            algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]
            print "GBM 'errsLast'", errsLast

            cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
            pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm)

            # xList.append(ntrees)
            xList.append(max_depth)
            eList.append(pctWrongTrain)
            fList.append(trainElapsed)

        # NOTE(review): this leaves beta_features off for any later tryList
        # entry, so only the first entry takes the noPoll parse path - confirm
        h2o.beta_features = False
        # just plot the last one
        if 1==1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult['destination_key'] = trainKey elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Parse (test)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTestResult['destination_key'] for h2o" parseTestResult['destination_key'] = testKey elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? 
# GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 # randomly ignore a bunch of cols, just to make it go faster x = range(num_cols) del x[response] ignored_cols_by_name = ",".join(map(str,random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, 'ignored_cols_by_name': ignored_cols_by_name, } if FORCE_FAIL_CASE: params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 
266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} ### print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? 
is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) h2o.beta_features = False xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if DO_KNOWN_FAIL: tryList = [ (1000000, 5, 'cD', 0, 320, 30), ] else: tryList = [ (1000000, 5, 'cD', 0, 10, 30), (1000000, 5, 'cD', 0, 20, 30), (1000000, 5, 'cD', 0, 40, 30), (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, 'cD', 0, 80, 30), # (1000000, 5, 'cD', 0, 160, 30), # fails..don't do # (1000000, 5, 'cD', 0, 320, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] if DO_APPEND_KNOWN_FAIL2: tryList.append((1000000, 5, 'cD', 0, 160, 30), ) #tryList.append( # (1000000, 5, 'cD', 0, 320, 30), #) ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' if DO_KNOWN_FAIL: # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails # csvFilename = 'a1' # fails csvFilename = "syn_ddply_1Mx5_0_320.gz" bucket = "home-0xdiag-datasets" csvPathname = "standard/" + csvFilename minInt = 0 maxInt = 320 else: bucket = None csvFilename = 'syn_' + "binary" + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "with range", ( maxInt - minInt) + 1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) for lll in range(1): # PARSE train**************************************** hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) inspect = h2o_cmd.runInspect(key=hexKey) missingValuesList = h2o_cmd.infoFromInspect( inspect, csvFilename) self.assertEqual( missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, 
resultKey=resultKey, timeoutSecs=60) #***************************************************************************************** # two columns. so worse case every combination of each possible value # only true if enough rows (more than the range?) maxExpectedGroups = ((maxInt - minInt) + 1)**2 # do it twice..to get the optimal cached delay for time? execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) groups = execResult['num_rows'] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg= "groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a1dump = h2o_cmd.runInspect(key="a1") print "a1", h2o.dump_json(a1dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1") self.assertEqual( missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial)) #***************************************************************************************** execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) groups = execResult['num_rows'] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? 
h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg= "groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a2dump = h2o_cmd.runInspect(key="a2") print "a2", h2o.dump_json(a2dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2") self.assertEqual( missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial)) #***************************************************************************************** # should be same answer in both cases execExpr = "sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) execExpr = "s=c(0); s=(a1!=a2)" (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=500) print "execResult", h2o.dump_json(execResult) #***************************************************************************************** # should never have any NAs in this result sdump = h2o_cmd.runInspect(key="s") print "s", h2o.dump_json(sdump) self.assertEqual( result, 1, "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult))) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = 'groups' eLabel = 'ddplyElapsed' fLabel = 'ddplyElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if h2o.localhost: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex') ] else: files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex') ] # if I got to hdfs, it's here # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378] = %s[,378]>15 ? 
1 : 0' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! print "Slow! exec is converting all imported keys?, not just what was parsed" execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = num_cols - 1 response = 378 print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 for max_depth in [5,10,20,40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': response, # 'ignored_cols': } print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** if doPredict: predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename print "This is crazy!" gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual=response, predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) if doPredict: xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rapids_vec_fail1(self): start = time.time() xList = [] eList = [] fList = [] bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): for trial in range(int(1e6), int(100e6), int(10e6)): # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))' execExpr = '(= !v (+ %v %v))' start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) xList.append(length) eList.append(elapsed1) fList.append(elapsed2) if 1 == 1: xLabel = 'vector length' eLabel = 'elapsed (create v)' fLabel = 'elapsed (v = v + v)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_plot_remove_keys(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 100, 'cG', 400), (200000, 100, 'cH', 400), (400000, 100, 'cI', 400), (800000, 100, 'cJ', 400), (1000000, 100, 'cK', 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES - 1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) iA = h2o_cmd.InspectObj(pA.parse_key) parseElapsed = pA.python_elapsed parse_key = pA.parse_key byteSize = pA.byteSize numRows = iA.numRows numCols = iA.numCols print parse_key, parseElapsed, byteSize, numRows, numCols labelList = iA.labelList node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(byteSize) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1 == 1: xLabel = 'byteSize' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if DO_KNOWN_FAIL: tryList = [(1000000, 5, "cD", 0, 320, 30)] else: tryList = [ # (1000000, 5, 'cD', 0, 10, 30), # (1000000, 5, 'cD', 0, 20, 30), # (1000000, 5, 'cD', 0, 40, 30), # (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, "cD", 0, 80, 30), (1000000, 5, "cD", 0, 160, 30), # fails..don't do # (1000000, 5, 'cD', 0, 320, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] if DO_APPEND_KNOWN_FAIL2: tryList.append((1000000, 5, "cD", 0, 160, 30)) tryList.append((1000000, 5, "cD", 0, 320, 30)) ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' if DO_KNOWN_FAIL: # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails # csvFilename = 'a1' # fails csvFilename = "syn_ddply_1Mx5_0_320.gz" bucket = "home-0xdiag-datasets" csvPathname = "standard/" + csvFilename minInt = 0 maxInt = 320 else: bucket = None csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) for lll in range(5): # PARSE train**************************************** hexKey = "r.hex" parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="local", hex_key=hexKey) inspect = h2o_cmd.runInspect(key=hexKey) missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename) self.assertEqual( missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList ) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, 
resultKey=resultKey, timeoutSecs=60) # ***************************************************************************************** # two columns. so worse case every combination of each possible value # only true if enough rows (more than the range?) maxExpectedGroups = ((maxInt - minInt) + 1) ** 2 # do it twice..to get the optimal cached delay for time? execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) groups = execResult["num_rows"] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt), ) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a1dump = h2o_cmd.runInspect(key="a1") print "a1", h2o.dump_json(a1dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1") self.assertEqual( missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial) ) # ***************************************************************************************** execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) groups = execResult["num_rows"] # this is a coarse comparision, statistically not valid for small rows, and certain ranges? 
h2o_util.assertApproxEqual( groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt), ) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) a2dump = h2o_cmd.runInspect(key="a2") print "a2", h2o.dump_json(a2dump) # should never have any NAs in this result missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2") self.assertEqual( missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial) ) # ***************************************************************************************** # should be same answer in both cases execExpr = "sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) execExpr = "s=c(0); s=(a1!=a2)" (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90) print "execResult", h2o.dump_json(execResult) # ***************************************************************************************** # should never have any NAs in this result sdump = h2o_cmd.runInspect(key="s") print "s", h2o.dump_json(sdump) self.assertEqual( result, 1, "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s" % (FUNC_PHRASE, result, h2o.dump_json(execResult)), ) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = "groups" eLabel = "ddplyElapsed" fLabel = "ddplyElapsed" eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_ddply_plot(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000000, 5, 'cD', 0, 10, 30), (1000000, 5, 'cD', 0, 20, 30), (1000000, 5, 'cD', 0, 30, 30), (1000000, 5, 'cD', 0, 40, 30), (1000000, 5, 'cD', 0, 50, 30), (1000000, 5, 'cD', 0, 70, 30), (1000000, 5, 'cD', 0, 100, 30), (1000000, 5, 'cD', 0, 130, 30), (1000000, 5, 'cD', 0, 160, 30), # (1000000, 5, 'cD', 0, 320, 30), # starts to fail here. too many groups? # (1000000, 5, 'cD', 0, 640, 30), # (1000000, 5, 'cD', 0, 1280, 30), ] ### h2b.browseTheCloud() xList = [] eList = [] fList = [] trial = 0 for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname, "with range", (maxInt-minInt)+1 write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE) # PARSE train**************************************** hexKey = 'r.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey) for resultKey, execExpr in initList: h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60) # do it twice..to get the optimal cached delay for time? 
execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")" start = time.time() (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) groups = execResult['num_rows'] maxExpectedGroups = ((maxInt - minInt) + 1) ** 2 h2o_util.assertApproxEqual(groups, maxExpectedGroups, rel=0.2, msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups)) ddplyElapsed = time.time() - start print "ddplyElapsed:", ddplyElapsed print "execResult", h2o.dump_json(execResult) # should be same answer in both cases execExpr = "d=sum(a1!=a2)==0" (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) print "execResult", h2o.dump_json(execResult) self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result) # xList.append(ntrees) trial += 1 # this is the biggest it might be ..depends on the random combinations # groups = ((maxInt - minInt) + 1) ** 2 xList.append(groups) eList.append(ddplyElapsed) fList.append(ddplyElapsed) if DO_PLOT: xLabel = 'groups' eLabel = 'ddplyElapsed' fLabel = 'ddplyElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_col_add(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' # csvPathname = 'airlines/year2013.csv' if localhost: # csvPathname = '1B/reals_100000x1000_15f.data' # csvPathname = '1B/reals_1000000x1000_15f.data' csvPathname = '1B/reals_1000000x1_15f.data' # csvPathname = '1B/reals_1B_15f.data' # csvPathname = '1B/reals_100M_15f.data' else: # csvPathname = '1B/reals_100000x1000_15f.data' # csvPathname = '1B/reals_1000000x1000_15f.data' csvPathname = '1B/reals_1000000x1_15f.data' # csvPathname = '1B/reals_1B_15f.data' # csvPathname = '1B/reals_100M_15f.data' hex_key = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) print "numRows:", inspect['numRows'] print "numCols:", inspect['numCols'] inspect = h2o_cmd.runInspect(key=hex_key, offset=-1) print "inspect offset = -1:", h2o.dump_json(inspect) xList = [] eList = [] fList = [] for execExpr in initList: execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) for trial in range(1000): for execExpr in exprList: # put the trial number into the temp for uniqueness execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr) execExpr = re.sub(',1', ',%s' % trial, execExpr) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300) execTime = time.time() - start print 'exec took', execTime, 'seconds' c = h2o.nodes[0].get_cloud() c = c['nodes'] # print (h2o.dump_json(c)) k = [i['num_keys'] for i in c] v = [i['value_size_bytes'] for i in c] print "keys: %s" % " ".join(map(str,k)) print "value_size_bytes: %s" % " ".join(map(str,v)) # print "result:", result if ('r1' in execExpr) and (not 'apply' in execExpr): xList.append(trial) eList.append(execTime) if ('apply' in execExpr): fList.append(execTime) h2o.check_sandbox_for_errors() # PLOTS. 
look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' eLabel = 'time: r1[,1] = Last.value = r2', fLabel = 'time: apply(r1, 2, sum)', eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_rapids_cbind_vec(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # stop if > 1G (fails memory cleaner assetion maxx = 29 # for trial in range(maxx): # for trial in range(int(1e6),int(200e6),int(1e6)): for trial in [int(100e6)]: # length = (2 ** trial) # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) length = trial execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1) start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10) elapsed1 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # execExpr = '(= !v (+ (+ $v $v) (+ $v $v))' # cols = 100 xList = [] eList = [] fList = [] for trial2 in range(0, 16): # for trial2 in range(0, 10): # fails. Post size? # for trial2 in range(0, 16): col = 2 ** trial2 # assert col < 16384, "h2o can't take col == 16384 or more" vString = ' '.join(['$v' for x in range(col)]) execExpr = '(= !v2 (cbind %s))' % vString # FIX! check the colnames. 2 cols get C1 and C10? 
odd # try: start = time.time() execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40) elapsed2 = time.time() - start if execResult['num_rows']: keys.append(execExpr) # except: # elapsed2 = 0 # h2p.red_print("ERROR: col = %s failed" % col) if 1==0: start = time.time() execExpr = '(sum $v2 $TRUE)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60) elapsed1 = time.time() - start # xList.append(length) xList.append(col) eList.append(elapsed1) fList.append(elapsed2) if 1==1: xLabel = 'col' eLabel = 'elapsed (sum)' fLabel = 'elapsed (cbind cols)' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rf_covtype_fvec(self): h2o.beta_features = True # fvec importFolderPath = "standard" # Parse Train ****************************************************** csvTrainFilename = 'covtype.shuffled.90pct.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180, doSummary=False) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) # Parse Test ****************************************************** csvTestFilename = 'covtype.shuffled.10pct.data' csvTestPathname = importFolderPath + "/" + csvTestFilename hex_key = csvTestFilename + ".hex" parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key']) rfViewInitial = [] xList = [] eList = [] fList = [] trial = 0 depthList = [10, 20, 30, 40] ntreesList = [5, 10, 20, 30] # ntreesList = [2] nbinsList = [10, 100, 1000] if TRY == 'max_depth': tryList = depthList elif TRY == 'ntrees': tryList = ntreesList elif TRY == 'nbins': tryList = nbinsList else: raise Exception("huh? %s" % TRY) for d in tryList: if TRY == 'max_depth': paramDict['max_depth'] = d elif TRY == 'ntrees': paramDict['ntrees'] = d elif TRY == 'nbins': paramDict['nbins'] = d else: raise Exception("huh? %s" % TRY) # adjust timeoutSecs with the number of trees # seems ec2 can be really slow if DO_OOBE: paramDict['validation'] = None else: paramDict['validation'] = parseTestResult['destination_key'] timeoutSecs = 30 + paramDict['ntrees'] * 200 # do ten starts, to see the bad id problem? 
TRIES = 5 for i in range(TRIES): lastOne = i==(TRIES-1) # have unique model names trial += 1 kwargs = paramDict.copy() model_key = 'RFModel_' + str(trial) kwargs['destination_key'] = model_key data_key = parseTrainResult['destination_key'] start = time.time() rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) trainElapsed = time.time() - start print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds' # don't cancel the last one if not lastOne: time.sleep(1) h2o_jobs.cancelAllJobs(timeoutSecs=2) ### print "rfView", h2o.dump_json(rfView) print "We have a result from the RF above, completed but didn't do RFView yet" # could the RF indicate 'done' too soon? # if rfResult['state']=='RUNNING': # raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult)) # if 'drf_model' not in rfResult: # raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult)) h2o_jobs.pollWaitJobs(timeoutSecs=300) rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60) print "rfView:", h2o.dump_json(rfView) rfView["drf_model"] = rfView.pop("speedrf_model") rf_model = rfView['drf_model'] cms = rf_model['cms'] ### print "cm:", h2o.dump_json(cm) ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] varimp = rf_model['varimp'] treeStats = rf_model['treeStats'] print "maxDepth:", treeStats['maxDepth'] print "maxLeaves:", treeStats['maxLeaves'] print "minDepth:", treeStats['minDepth'] print "minLeaves:", treeStats['minLeaves'] print "meanLeaves:", treeStats['meanLeaves'] print "meanDepth:", treeStats['meanDepth'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) # we iterate over params, so can't really do this check # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % 
classification_error) print "classErrorPctList:", classErrorPctList self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict") # FIX! should update this expected classification error predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key) eList.append(classErrorPctList[4]) fList.append(trainElapsed) if DO_PLOT: if TRY == 'max_depth': xLabel = 'max_depth' elif TRY == 'ntrees': xLabel = 'ntrees' elif TRY == 'nbins': xLabel = 'nbins' else: raise Exception("huh? %s" % TRY) xList.append(paramDict[xLabel]) if DO_PLOT: eLabel = 'class 4 pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_plot_remove_keys(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100000, 50, 'cG', 400, 400), (200000, 50, 'cH', 400, 400), (400000, 50, 'cI', 400, 400), (800000, 50, 'cJ', 400, 400), (1000000, 50, 'cK', 400, 400), ] xList = [] eList = [] fList = [] for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) NUM_CASES = h2o_util.fp_format() sel = random.randint(0, NUM_CASES - 1) csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount) csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel) start = time.time() print csvFilename, "parse starting" parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) parseElapsed = time.time() - start print "Parse only:", parseResult[ 'destination_key'], "took", parseElapsed, "seconds" h2o.check_sandbox_for_errors() # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? self.assertEqual( inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount)) parsedBytes = inspect['byteSize'] node = h2o.nodes[0] print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?" 
start = time.time() node.remove_key(hex_key, timeoutSecs=30) removeElapsed = time.time() - start print "Deleting", hex_key, "took", removeElapsed, "seconds" # xList.append(ntrees) xList.append(parsedBytes) eList.append(parseElapsed) fList.append(removeElapsed) # just plot the last one if 1 == 1: xLabel = 'parsedBytes' eLabel = 'parseElapsed' fLabel = 'removeElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() if localhost: tryList = [(10000, 100, "cA", 300)] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, "cD", 300), (10000, 200, "cE", 300), (10000, 300, "cF", 300), (10000, 400, "cG", 300), (10000, 500, "cH", 300), (10000, 1000, "cI", 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" hdrFilename = "hdr_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename hdrPathname = SYNDATASETS_DIR + "/" + hdrFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE train**************************************** h2o.beta_features = False # turn off beta_features start = time.time() xList = [] eList = [] fList = [] modelKey = "GBMModelKey" # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseTrainResult = h2i.import_parse( bucket=None, path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False, ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseTrainResult['destination_key'] for h2o" parseTrainResult["destination_key"] = trainKey elapsed = time.time() - start print "train parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. 
of timeout" % ( (elapsed * 100) / timeoutSecs ) print "train parse result:", parseTrainResult["destination_key"] # Logging to a benchmark file algo = "Parse" l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed ) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseTrainResult["destination_key"]) print "\n" + csvPathname, " num_rows:", "{:,}".format( inspect["num_rows"] ), " num_cols:", "{:,}".format(inspect["num_cols"]) num_rows = inspect["num_rows"] num_cols = inspect["num_cols"] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) # GBM(train iterate)**************************************** ntrees = 5 prefixList = ["A", "B", "C", "D", "E", "F", "G", "H"] # for max_depth in [5,10,20,40]: for max_depth in [5, 10, 20]: # PARSE a new header**************************************** print "Creating new header", hdrPathname prefix = prefixList.pop(0) write_syn_header(hdrPathname, rowCount, colCount, prefix) # upload and parse the header to a hex h2o.beta_features = False # can't put with fvec yet hdr_hex_key = prefix + "_hdr.hex" parseHdrResult = h2i.import_parse( bucket=None, path=hdrPathname, schema="put", header=1, # REQUIRED! 
otherwise will interpret as enums hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False, ) # Set Column Names (before autoframe is created) h2o.nodes[0].set_column_names(target=hex_key, copy_from=hdr_hex_key) # GBM print "The response col name is changing each iteration, since we're parsing a new header" params = { "learn_rate": 0.2, "nbins": 1024, "ntrees": ntrees, "max_depth": max_depth, "min_rows": 10, "response": prefix + "_response", "ignored_cols_by_name": None, } print "Using these parameters for GBM: ", params kwargs = params.copy() h2o.beta_features = True trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM( parseResult=parseTrainResult, noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs ) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname # Logging to a benchmark file algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth) l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed ) print l h2o.cloudPerfH2O.message(l) gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? 
errsLast = gbmTrainView["gbm_model"]["errs"][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm) print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrongTrain) fList.append(trainElapsed) # works if you delete the autoframe ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe') h2o.beta_features = False # just plot the last one if DO_PLOT: xLabel = "max_depth" eLabel = "pctWrong" fLabel = "trainElapsed" eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_GBM_manyfiles_train_test(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' if localhost: files = [ # None forces numCols to be used. assumes you set it from Inspect # problems with categoricals not in the train data set? (warnings in h2o stdout) ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex') # just use matching ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex') ] else: files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex') ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz h2b.browseTheCloud() for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + trainFilename parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "train parse result:", parseTrainResult['destination_key'] ### h2o_cmd.runSummary(key=parsTraineResult['destination_key']) inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # Make col 378 it something we can do binomial regression on! 
execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Parse (test)**************************************** parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "test parse result:", parseTestResult['destination_key'] # Make col 378 it something we can do binomial regression on! execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey) resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60) # Note ..no inspect of test data here..so translate happens later? # GBM (train iterate)**************************************** # if not response: # response = numCols - 1 # response = 378 response = 'C379' # randomly ignore a bunch of cols, just to make it go faster x = range(numCols) del x[response] ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x), random.sample(x, 300))) print "Using the same response %s for train and test (which should have a output value too)" % response ntrees = 10 # ignore 200 random cols (not the response) for max_depth in [5, 40]: params = { 'learn_rate': .2, 'nbins': 1024, 'ntrees': ntrees, 'max_depth': max_depth, 'min_rows': 10, 'response': 'C' + str(response), 'ignored_cols_by_name': ignored_cols_by_name, } ### print "Using these parameters for GBM: ", params kwargs = params.copy() # GBM train**************************************** trainStart = time.time() gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs) # hack trainElapsed = time.time() - trainStart print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? 
is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) # GBM test**************************************** predictKey = 'Predict.hex' ### h2o_cmd.runInspect(key=parseTestResult['destination_key']) start = time.time() gbmTestResult = h2o_cmd.runPredict( data_key=parseTestResult['destination_key'], model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix( actual=parseTestResult['destination_key'], vactual='C' + str(response), predict=predictKey, vpredict='predict', # choices are 0 and 'predict' ) # errrs from end of list? is that the last tree? # all we get is cm cm = gbmPredictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm is really NAs, not CM" print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) # xList.append(ntrees) xList.append(max_depth) eList.append(pctWrong) fList.append(trainElapsed) xLabel = 'max_depth' eLabel = 'pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_covtype_fvec(self): importFolderPath = "standard" # Parse Train ****************************************************** csvTrainFilename = 'covtype.shuffled.90pct.data' csvTrainPathname = importFolderPath + "/" + csvTrainFilename hex_key = csvTrainFilename + ".hex" parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key']) # Parse Test ****************************************************** csvTestFilename = 'covtype.shuffled.10pct.data' csvTestPathname = importFolderPath + "/" + csvTestFilename hex_key = csvTestFilename + ".hex" parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180) inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key']) rfViewInitial = [] xList = [] eList = [] fList = [] trial = 0 depthList = [10, 20, 30, 40] ntreesList = [5, 10, 20, 30] # ntreesList = [2] nbinsList = [10, 100, 1000] if TRY == 'max_depth': tryList = depthList elif TRY == 'ntrees': tryList = ntreesList elif TRY == 'nbins': tryList = nbinsList else: raise Exception("huh? %s" % TRY) for d in tryList: if TRY == 'max_depth': paramDict['max_depth'] = d elif TRY == 'ntrees': paramDict['ntrees'] = d elif TRY == 'nbins': paramDict['nbins'] = d else: raise Exception("huh? %s" % TRY) # adjust timeoutSecs with the number of trees # seems ec2 can be really slow trial += 1 paramDict['destination_key'] = 'RFModel_' + str(trial) if DO_OOBE: paramDict['validation'] = None else: paramDict['validation'] = parseTestResult['destination_key'] kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 200 start = time.time() rfFirstResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs) # print h2o.dump_json(rfFirstResult) # FIX! are these already in there? 
rfView = {} rfView['data_key'] = hex_key rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) trainElapsed = time.time() - start print "rf train end on ", csvTrainPathname, 'took', trainElapsed, 'seconds' ### print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=False, doSimpleCheck=False) h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) ## print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cm = rf_model['cm'] ### print "cm:", h2o.dump_json(cm) ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] varimp = rf_model['varimp'] treeStats = rf_model['treeStats'] print "maxDepth:", treeStats['maxDepth'] print "maxLeaves:", treeStats['maxLeaves'] print "minDepth:", treeStats['minDepth'] print "minLeaves:", treeStats['minLeaves'] print "meanLeaves:", treeStats['meanLeaves'] print "meanDepth:", treeStats['meanDepth'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRF2View(rfv=rfView) # "treeStats": { # "maxDepth": 15, # "maxLeaves": 1715, # "meanDepth": 15, # "meanLeaves": 421, # "minDepth": 15, # "minLeaves": 5 # }, # "varimp": null # FIX! 
should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key) # xList.append(ntrees) # paramDict['max_depth'] # paramDict['nbins'] # paramDict['mtries'] # paramDict['ntrees'] # eList.append(errs[-1]) eList.append(classErrorPctList[4]) fList.append(trainElapsed) if DO_PLOT: if TRY == 'max_depth': xLabel = 'max_depth' elif TRY == 'ntrees': xLabel = 'ntrees' elif TRY == 'nbins': xLabel = 'nbins' else: raise Exception("huh? %s" % TRY) xList.append(paramDict[xLabel]) eLabel = 'class 4 pctWrong' fLabel = 'trainElapsed' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)