def test_parse_manyfile_hack(self):
    """Import maxi gz files one-by-one, then parse the whole key list as one frame.

    Each import must touch exactly one file; the combined parse must have
    maxi * 100000 rows and 542 cols (assumes 100k rows / 542 cols per file
    -- TODO confirm against the dataset).
    """
    for trial in range(2):
        importFolderPath = "/home/0xdiag/datasets/manyfiles-nflx-gz"
        importList = []
        maxi = 50
        # files 4-9 don't exist, so numbering starts at 10.
        # FIX: was range(10, 10 + maxi + 1), which imported maxi+1 files and
        # contradicted the `numRows == maxi * 100000` assertion below.
        for i in range(10, 10 + maxi):
            csvFilename = "file_%s.dat.gz" % i
            csvPathname = importFolderPath + "/" + csvFilename
            importResult = h2o.n0.import_files(path=csvPathname)
            # just 1! each pathname names exactly one file
            import_key = importResult['keys'][0]
            assert len(importResult['keys']) == 1
            assert len(importResult['files']) == 1
            assert len(importResult['fails']) == 0
            assert len(importResult['dels']) == 0
            importList.append(import_key)

        timeoutSecs = 800
        # parse all imported keys into a single frame
        parseResult = h2o.n0.parse(key=importList, timeoutSecs=timeoutSecs)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)
        assert numRows == (maxi * 100000)
        assert numCols == 542
def test_parse_rand_utf8(self): SYNDATASETS_DIR = h2o.make_syn_dir() print "HACK: reduce rows to 10 for debug" tryList = [ # do two cols to detect bad eol behavior (10, 2, 'cA', 120), (10, 2, 'cG', 120), (10, 2, 'cH', 120), ] print "What about messages to log (INFO) about unmatched quotes (before eol)" # got this ..trying to avoid for now # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', check_header=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "parseResult:", dump_json(parseResult) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # FIX! check type? # print "inspect:", h2o.dump_json(inspect) self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount)) self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
def test_parse_rand_utf8(self): SYNDATASETS_DIR = h2o.make_syn_dir() print "HACK: reduce rows to 10 for debug" tryList = [ # do two cols to detect bad eol behavior (10, 2, 'cA', 120), (10, 2, 'cG', 120), (10, 2, 'cH', 120), ] print "What about messages to log (INFO) about unmatched quotes (before eol)" # got this ..trying to avoid for now # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "parseResult:", dump_json(parseResult) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # FIX! check type? # print "inspect:", h2o.dump_json(inspect) self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount)) self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
def test_kmeans_benign(self):
    """Run KMeans (K=4) on smalldata/logreg/benign.csv five times with
    random seeds and let simpleCheckKMeans validate the returned centers.
    """
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename

    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
        hex_key=hex_key, checkHeader=1, timeoutSecs=180, doSummary=False)
    numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

    inspectResult = h2o_cmd.runInspect(key=parse_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

    # (center, cluster size, ignored) per expected cluster
    expected = [
        ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None),
        ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None),
        ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None),
        ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None),
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01, 0.01)

    # no cols ignored
    labelListUsed = list(labelList)
    numColsUsed = numCols

    # loop, to see if we get same centers
    for attempt in range(5):
        randomSeed = random.randint(0, sys.maxint)
        # randomSeed = 6655548259421773879
        parameters = dict(
            validation_frame=parse_key,
            ignored_columns=None,
            score_each_iteration=False,
            K=4,
            max_iters=50,
            normalize=False,
            seed=randomSeed,
            init='PlusPlus',
        )

        model_key = 'benign_k.hex'
        kmeansResult = h2o.n0.build_model(
            algo='kmeans',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=10)

        modelResult = h2o.n0.models(key=model_key)

        # this prints too
        tuplesSorted, iters, mse, names = \
            h2o_kmeans.simpleCheckKMeans(self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
        h2o_cmd.runStoreView()

        # zip with * is it's own inverse here. It's sorted by centers for easy comparisons
        ids, mses, rows, clusters = zip(*tuplesSorted)
def test_parse_manyfile_hack(self):
    """Import maxi gz files individually, then parse the key list as one frame.

    Asserts the combined frame has maxi * 100000 rows and 542 cols
    (assumes 100k rows / 542 cols per file -- TODO confirm dataset shape).
    """
    for trial in range(2):
        importFolderPath = "/home/0xdiag/datasets/manyfiles-nflx-gz"
        importList = []
        maxi = 50
        # files 4-9 don't exist, so start numbering at 10.
        # FIX: range(10, 10 + maxi + 1) imported maxi+1 files, which
        # contradicted the `numRows == maxi * 100000` assertion below;
        # use exactly maxi files.
        for i in range(10, 10 + maxi):
            csvFilename = "file_%s.dat.gz" % i
            csvPathname = importFolderPath + "/" + csvFilename
            importResult = h2o.n0.import_files(path=csvPathname)
            # just 1! each pathname resolves to a single file
            import_key = importResult['keys'][0]
            assert len(importResult['keys']) == 1
            assert len(importResult['files']) == 1
            assert len(importResult['fails']) == 0
            assert len(importResult['dels']) == 0
            importList.append(import_key)

        timeoutSecs = 800
        parseResult = h2o.n0.parse(key=importList, timeoutSecs=timeoutSecs)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
            inspectResult)
        assert numRows == (maxi * 100000)
        assert numCols == 542
def test_GBM_covtype_train_test(self):
    """Train a small GBM on the shuffled 90% covtype split, then exercise
    models / compute_model_metrics / model_metrics / predict against the
    training frame.
    """
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'standard'
    trainFilename = 'covtype.shuffled.90pct.data'
    train_key = 'covtype.train.hex'
    timeoutSecs = 1800
    # NOTE: removed a dead `model_key = 'GBMModelKey'` assignment here; it was
    # unconditionally overwritten before first use below.

    csvPathname = importFolderPath + "/" + trainFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=train_key, timeoutSecs=timeoutSecs)
    numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

    inspectResult = h2o_cmd.runInspect(key=parse_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

    # no cols ignored
    labelListUsed = list(labelList)
    numColsUsed = numCols

    parameters = {
        'validation_frame': train_key,
        'ignored_columns': None,
        'score_each_iteration': True,
        'response_column': 'C55',
        'do_classification': True,
        # 'balance_classes':
        # 'max_after_balance_size':
        'ntrees': 2,
        'max_depth': 10,
        'min_rows': 3,
        'nbins': 40,
        'learn_rate': 0.2,
        # FIX! doesn't like it?
        # 'loss': 'Bernoulli',
        # FIX..no variable importance for GBM yet?
        'variable_importance': False,
        # 'seed':
    }

    model_key = 'benign_gbm.hex'
    bmResult = h2o.n0.build_model(
        algo='gbm',
        destination_key=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols # overwrite the key each time to save space? (100, 100, 'cF', 600), (100, 5000, 'cF', 600), (100, 10000, 'cF', 600), # (100, 12000, 'cF', 600), # (100, 15000, 'cF', 600), # (100, 17000, 'cF', 600), (100, 20000, 'cF', 600), (100, 40000, 'cF', 600), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) start = time.time() print "Parse start:", csvPathnamegz parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) if DOSUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parse_key, "took", time.time() - start, "seconds" print "Inspecting.." start = time.time() inspect = h2o_cmd.runInspect(key=parse_key, timeoutSecs=timeoutSecs) print "Inspect:", parse_key, "took", time.time() - start, "seconds" missingValuesList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvPathnamegz, \ "\n numRows:", "{:,}".format(numRows), \ "\n numCols:", "{:,}".format(numCols) self.assertEqual(len(missingValuesList), 0, "Don't expect any missing values. These cols had some: %s" % missingValuesList) # should match # of cols in header or ?? 
self.assertEqual(numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (numRows, rowCount))
def test_50_nongz_fvec(self): avgMichalSize = 237270000 * 2 bucket = 'home-0xdiag-datasets' importFolderPath = "many_many" print "Using non-gz'ed files in", importFolderPath csvFilenameList= [ ("*.dat", "file_18_A.dat", 18 * avgMichalSize, 1800), ] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg
def test_exec2_enums_rand_cut(self):
    """Build random enum 'cut' row-selection expressions with the Rapids DSL,
    apply them repeatedly with Assign, and time each exec.

    NOTE(review): structure reconstructed from collapsed source; assumes the
    DO_PLOT block sits at method level after the tryList loop -- confirm.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = ROWS
    tryList = [
        (n, 10, 9, 'cE', 300),
    ]

    # create key names to use for exec
    eKeys = ['e%s' % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        hex_key = 'p'
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        print "Creating", CUT_EXPR_CNT, 'cut expressions'
        for j in range(CUT_EXPR_CNT):
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression over a random subset of input cols
            cols = random.sample(range(iColCount), random.randint(1, iColCount))
            for c in cols:
                # possible choices within the column
                cel = colEnumList[c]
                # for now the cutValues are numbers for the enum mappings
                # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                # celChoice = str(random.choice(range(len(cel))))
                celChoice = random.choice(range(len(cel)))
                cutValue[c] = celChoice

            cutExprList = []
            pKey = Key('p')
            for i, c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    # cutExprList.append('p$C'+str(i+1)+'=='+c)
                    # all column indexing in h2o-dev is with number
                    e = Fcn('==', c, pKey[:, i])
                    cutExprList.append(e)

            # AND all the per-column comparisons together
            cutExpr = None
            for ce in cutExprList:
                if cutExpr:
                    cutExpr = Fcn('&', cutExpr, ce)
                else:
                    cutExpr = ce
            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            hKey = Key(hex_key)
            # row-select with the cut expression; all columns kept
            rowExpr = hKey[cutExpr, :]

            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
            inspect)
        # print h2o.dump_json(inspect)

        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

        # error if any col has constant values
        # if len(constantValuesDict) != 0:
        #     raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!

        # build up the columns
        Assign('b', [1, 2, 3])
        # could also append 1 col at a time, by assigning to the next col number?
        Assign('a', Cbind(['b' for i in range(colCount)]))

        for eKey in eKeys:
            Assign(eKey, 'a')
            ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            randICol = random.randint(0, iColCount - 1)
            randOCol = random.randint(iColCount, iColCount + oColCount - 1)

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            if 1 == 1:
                start = time.time()
                # apply one of the pre-built cut expressions
                Assign(fKey, random.choice(rowExprList)).do()
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

            if numRows == 0 or numCols != colCount:
                h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

            # FIX! put quantile back in?
            quantileTime = 0

            # remove all keys*******************************************************
            # what about hex_key?
            if 1 == 0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = 'trial'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_enums_rand_cut(self):
    """Build random enum 'cut' row-selection expressions with the Rapids DSL,
    apply them repeatedly with Assign, and time each exec.

    NOTE(review): structure reconstructed from collapsed source; assumes the
    DO_PLOT block sits at method level after the tryList loop -- confirm.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = ROWS
    tryList = [
        (n, 10, 9, 'cE', 300),
    ]

    # create key names to use for exec
    eKeys = ['e%s' % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        hex_key = 'p'
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        print "Creating", CUT_EXPR_CNT, 'cut expressions'
        for j in range(CUT_EXPR_CNT):
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression over a random subset of input cols
            cols = random.sample(range(iColCount), random.randint(1,iColCount))
            for c in cols:
                # possible choices within the column
                cel = colEnumList[c]
                # for now the cutValues are numbers for the enum mappings
                # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                # celChoice = str(random.choice(range(len(cel))))
                celChoice = random.choice(range(len(cel)))
                cutValue[c] = celChoice

            cutExprList = []
            pKey = Key('p')
            for i,c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    # cutExprList.append('p$C'+str(i+1)+'=='+c)
                    # all column indexing in h2o-dev is with number
                    e = Fcn('==', c, pKey[:,i])
                    cutExprList.append(e)

            # AND all the per-column comparisons together
            cutExpr = None
            for ce in cutExprList:
                if cutExpr:
                    cutExpr = Fcn('&', cutExpr, ce)
                else:
                    cutExpr = ce
            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            e = random.sample(eKeys,2)
            fKey = e[0]
            eKey = e[1]

            # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            hKey = Key(hex_key)
            # row-select with the cut expression; all columns kept
            rowExpr = hKey[cutExpr, :]

            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        # print h2o.dump_json(inspect)

        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

        # error if any col has constant values
        # if len(constantValuesDict) != 0:
        #     raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!

        # build up the columns
        Assign('b', [1,2,3])
        # could also append 1 col at a time, by assigning to the next col number?
        Assign('a', Cbind(['b' for i in range(colCount)]))

        for eKey in eKeys:
            Assign(eKey, 'a')
            ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            randICol = random.randint(0,iColCount-1)
            randOCol = random.randint(iColCount, iColCount+oColCount-1)

            # should be two different keys in the sample
            e = random.sample(eKeys,2)
            fKey = e[0]
            eKey = e[1]

            if 1==1:
                start = time.time()
                # apply one of the pre-built cut expressions
                Assign(fKey, random.choice(rowExprList)).do()
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            if numRows==0 or numCols!=colCount:
                h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

            # FIX! put quantile back in?
            quantileTime = 0

            # remove all keys*******************************************************
            # what about hex_key?
            if 1==0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = 'trial'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_exec2_xorsum(self):
    """Compare h2o's xorsum over a parsed random-real column against the
    value python computed while writing the dataset, at the raw 64-bit level.

    NOTE(review): structure reconstructed from collapsed source; loop
    nesting of the trailing summary prints is a best guess -- confirm.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]

    for trial in range(10):
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            # writer returns the python-side expected sums for later comparison
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # view each expected sum both as raw 64-bit pattern and as double
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            assert parse_key == hex_key
            assert numCols == colCount
            assert numRows == rowCount

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert len(missingList) == 0

            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                for r in range(10):
                    start = time.time()
                    execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                    fpResult = execResult['scalar']
                    # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                    print r, 'exec took', time.time() - start, 'seconds'
                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()

                    # reinterpret the float result as its raw 64-bit pattern
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))

                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)

                    # allow diff of the lsb..either way
                    # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                    if ullResult!=expectedUllSum:
                        raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))
                        # NOTE(review): unreachable after the raise above -- dead code?
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)

            h2o.check_sandbox_for_errors()

        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)

        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_GLM_basic_1(self): importFolderPath = "logreg" csvFilename = "prostate.csv" hex_key = "prostate.hex" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, checkHeader=1, timeoutSecs=180, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspectResult = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult) expected = [] allowedDelta = 0 # loop, to see if we get same centers # no cols ignored labelListUsed = list(labelList) numColsUsed = numCols for trial in range(1): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? parameters = { 'validation_frame': parse_key, 'ignored_columns': '[ID]', 'score_each_iteration': True, 'response_column': 'CAPSULE', # FIX! when is this needed? redundant for binomial? 'do_classification': True, 'balance_classes': False, 'max_after_balance_size': None, 'standardize': False, 'family': 'binomial', 'link': None, 'tweedie_variance_power': None, 'tweedie_link_power': None, 'alpha': '[1e-4]', 'lambda': '[0.5]', 'prior1': None, 'lambda_search': None, 'nlambdas': None, 'lambda_min_ratio': None, 'higher_accuracy': True, 'use_all_factor_levels': False, # NPE with n_folds 2? 
'n_folds': 1, } model_key = 'prostate_glm.hex' glmResult = h2o.n0.build_model( algo='glm', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) gr = self.GLMOutput(glmResult) for k,v in gr: if k != 'parameters': print "gr", k, dump_json(v) modelResult = h2o.n0.models(key=model_key) mr = self.GLMOutput(modelResult['models'][0]['output']) for k,v in mr: if k != 'parameters': print "mr", k, dump_json(v) cmmResult = h2o.n0.compute_model_metrics( model=model_key, frame=parse_key, timeoutSecs=60) print "cmmResult", dump_json(cmmResult) mmResult = h2o.n0.model_metrics( model=model_key, frame=parse_key, timeoutSecs=60) print "mmResult", dump_json(mmResult) # this prints too # tuplesSorted, iters, mse, names = \ # h2o_glm.simpleCheckGLM(self, modelResult, parameters, numRows, numColsUsed, labelListUsed) h2o_cmd.runStoreView()
def test_w2v_basic_2(self):
    """Parse generated separator files and run word2vec (CBOW + NegSampling)
    on each, then fetch model, metrics and predictions."""
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
    n = 100
    # (rows, cols, key, timeout)
    tryList = [
        # (n, 1, 'cD', 300),
        (n, 2, 'cE', 300),
        (n, 3, 'cF', 300),
        (n, 4, 'cG', 300),
        (n, 5, 'cH', 300),
        (n, 6, 'cI', 300),
        (n, 7, 'cJ', 300),
        (n, 9, 'cK', 300),
    ]

    ### h2b.browseTheCloud()
    for rowCount, colCount, hex_key, timeoutSecs in tryList:
        csvPathname = create_file_with_seps(rowCount, colCount)
        hex_key = "not_used.hex"

        # just parse to make sure it's good
        parseResult = h2i.import_parse(path=csvPathname,
            checkHeader=1, delete_on_done=0, timeoutSecs=180, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        src_key = h2i.find_key('syn_.*csv')

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(1):
            parameters = dict(
                validation_frame=parse_key,     # Frame False []
                ignored_columns=None,           # string[] None []
                score_each_iteration=None,      # boolean false []
                minWordFreq=1,                  # int 5 []
                wordModel='CBOW',               # enum [u'CBOW', u'SkipGram']
                normModel='NegSampling',        # enum # [u'HSM', u'NegSampling']
                negSampleCnt=1,                 # int 5 []
                vecSize=10,                     # int 100
                windowSize=2,                   # int 5
                sentSampleRate=0.001,           # float 0.001
                initLearningRate=0.05,          # float 0.05
                epochs=1,                       # int 5
            )

            model_key = 'benign_w2v.hex'
            bmResult = h2o.n0.build_model(
                algo='word2vec',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(
                model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
def test_rapids_overloaded_opr(self):
    """Exercise the Rapids DSL builder objects (Assign/Seq/Colon/Fcn/Col/Key)
    plus the overloaded indexing and <<= operators against a parsed frame.

    NOTE(review): structure reconstructed from collapsed source -- confirm
    nesting of the REPEAT loop body.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (1000000, 5, 'cA', 200),
        (1000, 5, 'cA', 200),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
            rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
            inspect)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(
            numCols, colCount,
            "parse created result with the wrong number of cols %s %s" %
            (numCols, colCount))
        self.assertEqual(
            numRows, rowCount,
            "parse created result with the wrong number of rows %s %s" %
            (numRows, rowCount))

        # Xbase.debugOnly = True

        REPEAT = 1
        data_key = hex_key
        for i in range(REPEAT):
            result_key = data_key + "_" + str(i)
            Assign('s1', Seq(range(5)))

            # take advantage of default params for row/col (None)
            # need the 'c' function, to make sure the key is created
            # first try as object, then method
            Assign('s2', Fcn('c', Seq(range(5))))
            # just combine
            Assign('s3', Col(Seq(range(5))))

            inspect = h2o_cmd.runInspect(key='s3')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            assert numRows == 5
            assert numCols == 1

            Assign('s2', Col(Seq(range(5))))
            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            assert numRows == 5
            assert numCols == 1

            # can't have sequence of sequences?
            # make sure key is created with c()
            f = Fcn(
                'c',
                Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                    range(50, 52)))
            Assign('s1', f)

            f = Col(
                Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                    range(50, 52)))
            Assign('s2', f)
            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            assert numRows == 313
            assert numCols == 1

            print "Now trying to do the functions with the alternate overloaded operators"
            # data_key is rebound from the hex key string to a Key object here
            data_key = Key(parse_key)
            result_key = Key()
            # what triggers immediate operation at h2o
            # as opposed to an object within a function
            result_key.frame = 'a1'
            result_key <<= data_key[Seq(range(1, 4)), :]
            result_key.frame = 'a2'
            result_key <<= data_key[Seq(range(1, 4)), :]
            result_key.frame = 'a3'
            result_key <<= data_key[Seq(range(1, 4)), :]
            result_key.frame = 'a4'
            result_key <<= data_key[Seq(range(1, 4)), 0:1]
            result_key.frame = 'a5'
            result_key <<= data_key[Seq(range(1, 4)), 0:1]
            result_key.frame = 'a6'
            result_key <<= data_key[[1, 2, 3], 1]

        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)
def test_rapids_overloaded_opr(self):
    """Duplicate definition of test_rapids_overloaded_opr (same tokens as
    the earlier copy): parse a random csv, then drive Rapids Assign/Seq/
    Col/Fcn expressions and the Key indexing / <<= operator overloads.

    NOTE(review): since this definition comes later in the class body, it
    is the one that actually runs; the earlier copy is shadowed.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # (1000000, 5, 'cA', 200),
        (1000, 5, 'cA', 200),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)
        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))
        # Xbase.debugOnly = True
        REPEAT = 1
        data_key = hex_key
        for i in range(REPEAT):
            # NOTE(review): overwritten by Key() further down; dead assignment
            result_key = data_key + "_" + str(i)
            Assign('s1', Seq(range(5)) )  # take advantage of default params for row/col (None)
            # need the 'c' function, to make sure the key is created
            # first try as object, then method
            Assign('s2', Fcn('c', Seq(range(5)) ))
            # just combine
            Assign('s3', Col(Seq(range(5)) ))
            inspect = h2o_cmd.runInspect(key='s3')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows==5
            assert numCols==1
            Assign('s2', Col(Seq(range(5))) )
            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert numRows==5
            assert numCols==1
            # can't have sequence of sequences?
            # make sure key is created with c()
            f = Fcn('c', Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) ))
            Assign('s1', f)
            f = Col(Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) ))
            Assign('s2', f)
            inspect = h2o_cmd.runInspect(key='s2')
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            # 302 rows from Colon(99,400) plus the 11 scalar/range elements
            assert numRows==313
            assert numCols==1
            print "Now trying to do the functions with the alternate overloaded operators"
            data_key = Key(parse_key)
            result_key = Key()
            # what triggers immediate operation at h2o
            # as opposed to an object within a function
            result_key.frame = 'a1'
            result_key <<= data_key[Seq(range(1,4)), :]
            result_key.frame = 'a2'
            result_key <<= data_key[Seq(range(1,4)), :]
            result_key.frame = 'a3'
            result_key <<= data_key[Seq(range(1,4)), :]
            result_key.frame = 'a4'
            result_key <<= data_key[Seq(range(1,4)), 0:1]
            result_key.frame = 'a5'
            result_key <<= data_key[Seq(range(1,4)), 0:1]
            result_key.frame = 'a6'
            result_key <<= data_key[[1,2,3], 1]
            print "\n" + csvPathname, \
                " numRows:", "{:,}".format(numRows), \
                " numCols:", "{:,}".format(numCols)
def test_hdfs_cdh5(self):
    """Import a list of datasets from HDFS (CDH5), parse each, and
    optionally (DO_EXPORT) export the parsed frame back to HDFS under
    /tmp2 and re-import/re-parse the exported copy.
    """
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    # (filename, timeoutSecs) pairs
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        # ("covtype.169x.data", 1200),
        ("prostate_long_1G.csv", 200),
        ("airlines_all.csv", 1200),
    ]
    # pick 8 randomly!
    if (1==0):
        csvFilenameList = random.sample(csvFilenameAll,8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll
    # pop open a browser on the cloud
    # h2b.browseTheCloud()
    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingValuesListA, labelListA, numRowsA, numColsA = h2o_cmd.infoFromInspect(inspectResult)
        if DO_EXPORT:
            start = time.time()
            print "Saving", csvFilename, 'to HDFS'
            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            print "Unique per-user to avoid permission issues"
            username = getpass.getuser()
            csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
            # reuse the file name to avoid running out of space
            csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)
            path = "hdfs://"+ h2o.nodes[0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
            trial += 1
            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            # NOTE(review): re-import uses the csvPathname reassigned above,
            # i.e. the exported copy under tmp2/ — presumably intentional;
            # confirm whether the reload should also run when DO_EXPORT is off
            d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
def test_kmeans_prostate(self):
    """Parse smalldata/logreg/prostate.csv and run KMeans (K=3) five
    times with random seeds, checking each model via simpleCheckKMeans.
    """
    importFolderPath = "logreg"
    csvFilename = "prostate.csv"
    hex_key = "prostate.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        checkHeader=1, timeoutSecs=180, doSummary=False)
    numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
    inspectResult = h2o_cmd.runInspect(key=parse_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)
    # loop, to see if we get same centers
    # (center coords, expected rows in cluster, expected within-cluster error)
    expected = [
        ([0.37,65.77,1.07,2.23,1.11,10.49,4.24,6.31], 215, 36955),
        ([0.36,66.44,1.09,2.21,1.06,10.84,34.16,6.31], 136, 46045),
        ([0.83,66.17,1.21,2.86,1.34,73.30,15.57,7.31], 29, 33412),
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    # 'ID' is ignored by the model, so drop it from the label list
    labelListUsed = list(labelList)
    labelListUsed.remove('ID')
    numColsUsed = numCols - 1
    for trial in range(5):
        # kmeansSeed = random.randint(0, sys.maxint)
        # actually can get a slightly better error sum with a different seed
        # this seed gets the same result as scikit
        # kmeansSeed = 6655548259421773879
        kmeansSeed = random.randint(0, sys.maxint)
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': '[ID]',
            'score_each_iteration': False,
            'K': 3,
            'max_iters': 500,
            'normalize': False,
            'seed': kmeansSeed,
            'init': 'PlusPlus',
        }
        model_key = 'prostate_k.hex'
        kmeansResult = h2o.n0.build_model(
            algo='kmeans',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=10)
        modelResult = h2o.n0.models(key=model_key)
        h2o_cmd.runStoreView()
        tuplesSorted, iters, mse, names = \
            h2o_kmeans.simpleCheckKMeans(self, modelResult, parameters,
                numRows, numColsUsed, labelListUsed)
        # unzip the per-cluster tuples for any further comparison
        ids, mses, rows, clusters = zip(*tuplesSorted)
def test_DL_basic(self):
    """Parse smalldata/logreg/benign.csv and build a DeepLearning model
    with all-default parameters (every tunable passed as None), then pull
    model metrics and predictions for the training frame.

    Fix: the parameters dict literal previously listed 'balance_classes'
    and 'max_after_balance_size' twice; duplicate keys in a dict literal
    silently override, so the duplicates (both None) are removed.
    """
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        checkHeader=1, timeoutSecs=180, doSummary=False)
    numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
    inspectResult = h2o_cmd.runInspect(key=parse_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)
    expected = []
    allowedDelta = 0
    # no cols ignored
    labelListUsed = list(labelList)
    labelListUsed.remove('STR')
    numColsUsed = numCols - 1
    for trial in range(1):
        # None means "use the h2o default" (shown in the trailing comment)
        parameters = {
            'validation_frame': parse_key,  # Frame None
            'ignored_columns': '[STR]',  # string[] None
            'score_each_iteration': None,  # boolean false
            'response_column': 'FNDX',  # string None
            'do_classification': None,  # boolean false
            'balance_classes': None,  # boolean false
            'max_after_balance_size': None,  # float Infinity
            'n_folds': None,  # int 0
            'keep_cross_validation_splits': None,  # boolean false
            'checkpoint': None,  # Key None
            'override_with_best_model': None,  # boolean true
            'expert_mode': None,  # boolean false
            'autoencoder': None,  # boolean false
            'use_all_factor_levels': None,  # boolean true
            # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
            'activation': None,  # enum Rectifier
            'hidden': None,  # int[] [200, 200]
            'epochs': None,  # double 10.0
            'train_samples_per_iteration': None,  # long -2
            'target_ratio_comm_to_comp': None,  # double 0.02
            'seed': None,  # long 1679194146842485659
            'adaptive_rate': None,  # boolean true
            'rho': None,  # double 0.99
            'epsilon': None,  # double 1.0E-8
            'rate': None,  # double 0.005
            'rate_annealing': None,  # double 1.0E-6
            'rate_decay': None,  # double 1.0
            'momentum_start': None,  # double 0.0
            'momentum_ramp': None,  # double 1000000.0
            'momentum_stable': None,  # double 0.0
            'nesterov_accelerated_gradient': None,  # boolean true
            'input_dropout_ratio': None,  # double 0.0
            'hidden_dropout_ratios': None,  # double[] None (this can grid?)
            'l1': None,  # double 0.0
            'l2': None,  # double 0.0
            'max_w2': None,  # float Infinity
            'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
            'initial_weight_scale': None,  # double 1.0
            'loss': None,  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
            'score_interval': None,  # double 5.0
            'score_training_samples': None,  # long 10000
            'score_validation_samples': None,  # long 0
            'score_duty_cycle': None,  # double 0.1
            'classification_stop': None,  # double 0.0
            'regression_stop': None,  # double 1.0E-6
            'quiet_mode': None,  # boolean false
            'max_confusion_matrix_size': None,  # int 20
            'max_hit_ratio_k': None,  # int 10
            'class_sampling_factors': None,  # float[] None
            'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
            'diagnostics': None,  # boolean true
            'variable_importances': None,  # boolean false
            'fast_mode': None,  # boolean true
            'ignore_const_cols': None,  # boolean true
            'force_load_balance': None,  # boolean true
            'replicate_training_data': None,  # boolean false
            'single_node_mode': None,  # boolean false
            'shuffle_training_data': None,  # boolean false
            'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
            'sparse': None,  # boolean false
            'col_major': None,  # boolean false
            'average_activation': None,  # double 0.0
            'sparsity_beta': None,  # double 0.0
        }
        model_key = 'benign_dl.hex'
        bmResult = h2o.n0.build_model(
            algo='deeplearning',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')
        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
        h2o_cmd.runStoreView()
def test_GLM_basic_2(self):
    """Parse smalldata/logreg/prostate.csv and fit a binomial GLM on
    CAPSULE (ID ignored), then check the model and pull metrics and
    predictions for the training frame.
    """
    importFolderPath = "logreg"
    csvFilename = "prostate.csv"
    hex_key = "prostate.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        checkHeader=1, timeoutSecs=180, doSummary=False)
    numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
    inspectResult = h2o_cmd.runInspect(key=parse_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)
    expected = []
    allowedDelta = 0
    # drop the ignored column and the response from the label list
    labelListUsed = list(labelList)
    labelListUsed.remove('ID')
    labelListUsed.remove('CAPSULE')
    numColsUsed = numCols - 2
    for trial in range(1):
        # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
        # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
        # can we do classification with probabilities?
        # are only lambda and alpha grid searchable?
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': '[ID]',
            'score_each_iteration': True,
            'response_column': 'CAPSULE',
            # FIX! when is this needed? redundant for binomial?
            'do_classification': True,
            'balance_classes': False,
            'max_after_balance_size': None,
            'standardize': False,
            'family': 'binomial',
            'link': None,
            'tweedie_variance_power': None,
            'tweedie_link_power': None,
            'alpha': '[1e-4]',
            'lambda': '[0.5]',
            'prior1': None,
            'lambda_search': None,
            'nlambdas': None,
            'lambda_min_ratio': None,
            'higher_accuracy': True,
            'use_all_factor_levels': False,
            # NPE with n_folds 2?
            'n_folds': 1,
        }
        model_key = 'prostate_glm.hex'
        bmResult = h2o.n0.build_model(
            algo='glm',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=10)
        bm = OutputObj(bmResult, 'bm')
        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        h2o_glm.newSimpleCheckGLM(self, model, parameters, labelList, labelListUsed)
        cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
        mm = OutputObj(mmResult, 'mm')
        prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
        h2o_cmd.runStoreView()
def test_mixed_int_enum_many(self):
    """Generate csvs mixing enum strings and ints (plus blanks), parse
    them, and check per-column Summary results (type, cardinality, bins,
    NA counts) against what the dataset writer reports.

    NOTE(review): a second definition of this method name appears later
    in the file; the later one shadows this at class-creation time.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
    enumList = ['abc', 'def', 'ghi']
    # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
    intList = [0, 1, 2, '']
    expectedList = ['abc', 'def', 'ghi']
    # (rows, cols, key, enumChoices, enumExpected, intChoices, resultIsEnum)
    tryList = [
        # not sure about this case
        # some of the cases interpret as ints now (not as enum)
        (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False),
        # colname, (min, COLS5th, 50th, 75th, max)
        (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True),
        # fails this case
        (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True),
        (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1], True),
        (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True),
        # this case seems to fail
        (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True),
        # this seems wrong also
        (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2], True),
    ]
    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)

    # thin wrapper turning a summary-response dict into attribute access
    class Column(object):
        def __init__(self, column):
            assert isinstance(column, dict)
            for k, v in column.iteritems():
                setattr(self, k, v)  # achieves self.k = v

    x = 0
    timeoutSecs = 60
    for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices,
            resultIsEnum) in tryList:
        # max error = half the bin size?
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
            colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        print "Creating random", csvPathname
        expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
            enumChoices, intChoices)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', check_header=0,
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        print "numRows:", numRows, "numCols:", numCols
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\nTrial:", trial, csvFilename
        # this summary only does one column?
        # assert colCount == len(columns), "%s %s" % (colCount, len(columns))
        for i in range(colCount):
            summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i + 1))
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            # columns = summaryResult['frames'][0]['columns']
            # NOTE(review): wraps the whole summary response, not a single
            # column dict as in the other copy of this test — confirm the
            # response shape actually exposes base/bins/etc. at top level
            co = Column(summaryResult)
            # how are enums binned. Stride of 1? (what about domain values)
            coList = [
                co.base, len(co.bins), len(co.data), co.domain, co.label,
                co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data, co.stride,
                co.type, co.zeros,
            ]
            coNameList = [
                'co.base', 'len(co.bins)', 'len(co.data)', 'co.domain',
                'co.label', 'co.maxs', 'co.mean', 'co.mins', 'co.missing',
                'co.ninfs', 'co.pctiles', 'co.pinfs', 'co.precision',
                'co.sigma', 'co.str_data', 'co.stride', 'co.type', 'co.zeros',
            ]
            for c, n in zip(coList, coNameList):
                print n + ":", c
            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            # what is precision. -1?
            # This can go to NaN (string) with big numbers
            # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)
            # can be None if col is all NA
            # print "FIX! hacking the co.pctiles because it's short by two"
            # pctiles = [0] + co.pctiles + [0]
            assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (
                co.zeros, numRows)
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(
                    co.type, 'enum',
                    "Expecting co.type %s to be 'enum' for %s co label %s" %
                    (co.type, i, co.label))
            if ENABLE_ASSERTS and resultIsEnum:
                # not always there
                cardinality = len(co.domain)
                self.assertEqual(
                    cardinality, len(enumChoices),
                    msg="trial %s: cardinality %s should be %s" %
                    (trial, cardinality, len(enumChoices)))
            # assume I create the list above in the same order that h2o will show the order. sorted?
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(co.bins, enumChoices)
            hcntTotal = sum(co.bins)
            numRowsCreated = rowCount + len(intChoices)
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])
            self.assertEqual(numRows, numRowsCreated,
                msg="trial %s: numRows %s should be %s" %
                (trial, numRows, numRowsCreated))
            nacnt = co.missing
            if ENABLE_ASSERTS and resultIsEnum:
                self.assertEqual(
                    nacnt, expectedNaCnt[i],
                    "trial %s: Column %s Expected %s. nacnt %s incorrect" %
                    (trial, i, expectedNaCnt[i], nacnt))
            # FIX! no checks for the case where it got parsed as int column!
        trial += 1
def test_summary2_exp(self):
    """Generate exponentially-distributed data, parse it, and compare the
    h2o Summary percentiles/min/max against expectations, then cross-check
    the median (or 0.999 quantile) against a python-side sorted compute.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # rate parameter for the exponential data generator
    LAMBD = random.uniform(0.005, 0.5)
    # (rows, cols, key, rangeMin, rangeMax, [label, min, 25th, 50th, 75th, max])
    tryList = [
        # co.label, (min, 25th, 50th, 75th, max)
        # parse setup error
        # (1, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        (5, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        # (10, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        # (100, 1, 'x.hex', 1, 20000, ['C1', None, None, None, None, None]),
        # (1000, 1, 'x.hex', -5000, 0, ['C1', None, None, None, None, None]),
        # (10000, 1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
        # (100000, 1, 'x.hex', -1, 1, ['C1', None, None, None, None, None]),
        # (1000000, 1, 'A.hex', 1, 100, ['C1', None, None, None, None, None]),
    ]
    timeoutSecs = 10
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    x = 0
    timeoutSecs = 60

    # thin wrapper turning a summary column dict into attribute access
    class Column(object):
        def __init__(self, column):
            assert isinstance(column, dict)
            for k,v in column.iteritems():
                setattr(self, k, v)  # achieves self.k = v

    for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        x += 1
        csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname, "lambd:", LAMBD
        (expectedMin, expectedMax) = write_syn_dataset(csvPathname, rowCount, colCount,
            lambd=LAMBD, SEED=SEEDPERFILE)
        print "expectedMin:", expectedMin, "expectedMax:", expectedMax
        # max error = half the bin size?
        maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
        # add 5% for fp errors?
        maxDelta = 1.05 * maxDelta
        # min/max come from the data actually written, not the range args
        expected[1] = expectedMin
        expected[5] = expectedMax
        csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvFilename
        # column 0?
        summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1')
        h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))
        # default_pctiles
        # isText
        # rows
        # off
        # key
        # checksum
        # only one column
        columns = summaryResult['frames'][0]['columns']
        default_pctiles = summaryResult['frames'][0]['default_pctiles']
        co = Column(columns[0])
        # how are enums binned. Stride of 1? (what about domain values)
        coList = [
            co.base, len(co.bins), len(co.data), co.domain,
            co.label, co.maxs, co.mean, co.mins,
            co.missing, co.ninfs, co.pctiles, co.pinfs,
            co.precision, co.sigma, co.str_data, co.stride,
            co.type, co.zeros,
        ]
        for c in coList:
            print c
        print "len(co.bins):", len(co.bins)
        print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
        # what is precision. -1?
        print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)
        print "FIX! hacking the co.pctiles because it's short by two"
        # pad so pctiles indexes line up with the expected-quantile positions
        pctiles = [0] + co.pctiles + [0]
        # the thresholds h2o used, should match what we expected
        if expected[0]:
            self.assertEqual(co.label, expected[0])
        if expected[1]:
            h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta,
                msg='min is not approx. expected')
        if expected[2]:
            h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta,
                msg='25th percentile is not approx. expected')
        if expected[3]:
            h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
        if expected[4]:
            h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta,
                msg='75th percentile is not approx. expected')
        if expected[5]:
            h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta,
                msg='max is not approx. expected')
        # figure out the expected max error
        # use this for comparing to sklearn/sort
        if expected[1] and expected[5]:
            expectedRange = expected[5] - expected[1]
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxErr = expectedBin
            # should we have some fuzz for fp?
        else:
            print "Test won't calculate max expected error"
            maxErr = 0
        pt = h2o_util.twoDecimals(pctiles)
        mx = h2o_util.twoDecimals(co.maxs)
        mn = h2o_util.twoDecimals(co.mins)
        print "co.label:", co.label, "co.pctiles (2 places):", pt
        print "default_pctiles:", default_pctiles
        print "co.label:", co.label, "co.maxs: (2 places):", mx
        print "co.label:", co.label, "co.mins: (2 places):", mn
        # FIX! we should do an exec and compare using the exec quantile too
        compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
        h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
        print "co.label:", co.label, "co.maxs (2 places):", mx
        print "co.label:", co.label, "co.mins (2 places):", mn
        trial += 1
        h2o.nodes[0].remove_all_keys()
        scipyCol = 0
        print "h2oSummary2MaxErr", maxErr
        if co.label!='' and expected[scipyCol]:
            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            h2o_summ.quantile_comparisons(
                csvPathnameFull,
                skipHeader=False,
                col=scipyCol,
                datatype='float',
                quantile=0.5 if DO_MEDIAN else 0.999,
                h2oSummary2=pctiles[5 if DO_MEDIAN else 10],
                # h2oQuantilesApprox=qresult_single,
                # h2oQuantilesExact=qresult,
                h2oSummary2MaxErr=maxErr,
            )
def test_mixed_int_enum_many(self): SYNDATASETS_DIR = h2o.make_syn_dir() # this should be a sorted list for comparing to hbrk in the histogram in h2o summary? enumList = ['abc', 'def', 'ghi'] # numbers 1 and 2 may not be counted as NAs correctly? what about blank space? intList = [0, 1, 2, ''] expectedList = [ 'abc', 'def', 'ghi'] tryList = [ # not sure about this case # some of the cases interpret as ints now (not as enum) (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False), # colname, (min, COLS5th, 50th, 75th, max) (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True), # fails this case (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True), (ROWS, COLS, 'd.hex', enumList[0: ], expectedList[0: ], intList[0:1], True), (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True), # this case seems to fail (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True), # this seems wrong also (ROWS, COLS, 'g.hex', enumList[0: ], expectedList[0: ], intList[0:2], True), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) class Column(object): def __init__(self, column): assert isinstance(column, dict) for k,v in column.iteritems(): setattr(self, k, v) # achieves self.k = v x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList: # max error = half the bin size? 
SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, enumChoices, intChoices) parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0, hex_key=hex_key, timeoutSecs=10, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) print "numRows:", numRows, "numCols:", numCols inspect = h2o_cmd.runInspect(None, hex_key) print "\nTrial:", trial, csvFilename # this summary only does one column? # assert colCount == len(columns), "%s %s" % (colCount, len(columns)) for i in range(colCount): summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i+1)) h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult)) columns = summaryResult['frames'][0]['columns'] co = Column(columns[0]) # how are enums binned. Stride of 1? (what about domain values) coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros, ] coNameList = [ 'co.base', 'len(co.bins)', 'len(co.data)', 'co.domain', 'co.label', 'co.maxs', 'co.mean', 'co.mins', 'co.missing', 'co.ninfs', 'co.pctiles', 'co.pinfs', 'co.precision', 'co.sigma', 'co.str_data', 'co.stride', 'co.type', 'co.zeros', ] for c,n in zip(coList, coNameList): print n+":", c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) # what is precision. -1? # This can go to NaN (string) with big numbers # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) # can be None if col is all NA # print "FIX! 
hacking the co.pctiles because it's short by two" # pctiles = [0] + co.pctiles + [0] assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows) if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(co.type, 'Enum', "trial %s: Expecting type to be Enum for %s col colname %s" % (trial, i, colname)) if ENABLE_ASSERTS and resultIsEnum: # not always there cardinality = len(co.domain) self.assertEqual(cardinality, len(enumChoices), msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices))) # assume I create the list above in the same order that h2o will show the order. sorted? if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(co.bins, enumChoices) hcntTotal = sum(co.bins) numRowsCreated = rowCount + len(intChoices) if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i]) self.assertEqual(numRows, numRowsCreated, msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated)) nacnt = co.missing if ENABLE_ASSERTS and resultIsEnum: self.assertEqual(nacnt, expectedNaCnt[i], "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt)) # FIX! no checks for the case where it got parsed as int column! trial += 1
def test_exec2_xorsum(self):
    """Write random real-valued csvs, parse, then run exec expressions
    repeatedly and compare h2o's xorsum (as raw 64-bit patterns) against
    the python-side expected xorsum from the dataset writer.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rows, cols, key, expectedMin, expectedMax, expected)
    tryList = [
        (ROWS, 1, 'r1', 0, 10, None),
    ]
    for trial in range(10):
        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin
            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # both views of each expected sum: raw 64-bit pattern and float
            expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=3000, retryDelaySecs=2)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            assert parse_key == hex_key
            assert numCols == colCount
            assert numRows == rowCount
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            assert len(missingList) == 0
            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                for r in range(10):
                    if 1==0:
                        execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                        fpResult = execResult['scalar']
                    else:
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr,
                            resultKey='x', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="h"))
                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="r1"))
                    print r, "execResult:", h2o.dump_json(execResult)
                    h2o_cmd.runStoreView()
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    ullResultList.append((ullResult, fpResult))
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
                    # allow diff of the lsb..either way
                    # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                    if ullResult!=expectedUllSum:
                        raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                            (ullResult, expectedUllSum))
                        # NOTE(review): unreachable after the raise above —
                        # leftover from when the raise was the commented
                        # tolerance check
                        print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                            (ullResult, expectedUllSum)
        h2o.check_sandbox_for_errors()
        print "first result was from a sum. others are xorsum"
        print "ullResultList:"
        for ullResult, fpResult in ullResultList:
            print "%30s" % "ullResult (0.16x):", "0x%0.16x %s" % (ullResult, fpResult)
        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x %s" % (expectedUllSum, expectedUllSumAsDouble)
        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x %s" % (expectedFpSumAsLongLong, expectedFpSum)
def test_quant_cmp_uniform(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (5*ROWS, 1, 'x.hex', 1, 20000, ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]), (5*ROWS, 1, 'x.hex', -5000, 0, ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]), (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]), (1*ROWS, 1, 'x.hex', -1, 1, ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]), (1*ROWS, 1, 'A.hex', 1, 100, ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]), (1*ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]), (1*ROWS, 1, 'B.hex', 1, 10000, ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]), (1*ROWS, 1, 'B.hex', -100, 100, ['C1', -100.10, -50.0, 0.85, 51.7, 100,00]), (1*ROWS, 1, 'C.hex', 1, 100000, ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]), (1*ROWS, 1, 'C.hex', -101, 101, ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]), ] timeoutSecs = 10 trial = 1 n = h2o.nodes[0] lenNodes = len(h2o.nodes) x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList: # max error = half the bin size? colname = expected[0] maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0 # add 5% for fp errors? 
maxDelta = 1.05 * maxDelta SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) # need the full pathname when python parses the csv for numpy/sort csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) #*************************** # Parse parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) #*************************** # Inspect inspect = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(parseResult) #*************************** # Summary summaryResult = h2o_cmd.runSummary(key=parse_key) columns = summaryResult['frames'][0]['columns'] default_pctiles = summaryResult['frames'][0]['default_pctiles'] co = OutputObj(columns[0], 'summary') coList = [ co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros] for c in coList: print c print "len(co.bins):", len(co.bins) print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean) print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma) print "FIX! 
hacking the co.pctiles because it's short by two" summ_pctiles = [0] + co.pctiles + [0] pt = h2o_util.twoDecimals(summ_pctiles) mx = h2o_util.twoDecimals(co.maxs) mn = h2o_util.twoDecimals(co.mins) exp = h2o_util.twoDecimals(expected[1:]) print "co.label:", co.label, "co.pctiles (2 places):", pt print "default_pctiles:", default_pctiles print "co.label:", co.label, "co.maxs: (2 places):", mx print "co.label:", co.label, "co.mins: (2 places):", mn # FIX! we should do an exec and compare using the exec quantile too h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ mn[0], pt[3], pt[5], pt[7], mx[0]) h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\ exp[0], exp[1], exp[2], exp[3], exp[4]) #*************************** # Quantile # the thresholds h2o used, should match what we expected # using + here seems to result in an odd tuple..doesn't look right to h2o param # so went with this. Could add '[' and ']' to the list first, before the join. probsStr = "[%s]" % ",".join(map(str,probsList)) parameters = { 'destination_key': "a.hex", 'training_frame': parse_key, 'validation_frame': parse_key, 'ignored_columns': None, 'score_each_iteration': False, 'probs': probsStr, } model_key = 'qhex' bmResult = h2o.n0.build_model( algo='quantile', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') msec = bm.jobs[0]['msec'] print "bm msec", msec # quantile result is just a job result to a key modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0], 'model') # model output { # "domains": [ # null # ], # "iters": 1.0, # "model_category": null, # "names": [ # "C1" # ], # "quantiles": [ # [ # 10009.03502345 # ] # ], print "model.output:", model.output print "model.output:['quantiles']", model.output['quantiles'] print "model.output:['iters']", model.output['iters'] print "model.output:['names']", model.output['names'] quantiles = 
model.output['quantiles'][0] # why is this a double array iters = model.output['iters'] assert iters == 11, iters print "quantiles: ", quantiles print "iters: ", quantiles # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # cmm = OutputObj(cmmResult, 'cmm') # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) # mm = OutputObj(mmResult, 'mm') # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView() trial += 1 # compare the last threshold if colname!='': # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() h2o_summ.quantile_comparisons( csvPathnameFull, col=0, # what col to extract from the csv datatype='float', quantile=CHECK_PCTILE, # h2oSummary2=pctile[-1], # h2oQuantilesApprox=result, # from exec h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX], ) h2o.nodes[0].remove_all_keys()