def test_parse_manyfiles_1(self):
    h2o.beta_features = True
    # these will be used as directory imports/parse
    csvDirname = "manyfiles-nflx-gz"
    timeoutSecs = 600
    trial = 0
    for iteration in range(ITERATIONS):
        csvFilename = "file_1.dat.gz"
        csvPathname = csvDirname + "/" + csvFilename
        trialStart = time.time()

        # import*****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # the import has to overwrite existing keys. no parse
        h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
        elapsed = time.time() - start
        print "import", trial, "end ", 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the import"
        for node in h2o.nodes:
            h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_50_nongz_fvec(self):
    avgMichalSize = 237270000
    bucket = "home-0xdiag-datasets"
    importFolderPath = "manyfiles-nflx-gz"
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
        ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
    ]

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern
        hex_key = csvFilename + ".hex"

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")

        importFullList = importResult["files"]
        importFailList = importResult["fails"]
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        parseResult = h2i.import_parse(
            bucket=bucket, path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=600
        )
        execExpr = "A.hex=%s" % parseResult["destination_key"]
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

        h2o_cmd.runStoreView(timeoutSecs=60)
def test_exec2_fast_locks(self):
    csvPathname = 'iris/iris2.csv'
    src_key = 'iris.csv'
    if not AVOID_BUG:
        # need the key name (pattern) to feed to parse
        (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
            src_key=src_key, timeoutSecs=10)
        # just as a reminder of what these returns look like
        print "importResult:", h2o.dump_json(importResult)
        print "importPattern:", h2o.dump_json(importPattern)
    y = 4

    for trial in range(1, 100):
        if AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)

        # make sure each parse is a unique dest key (not in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
            delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
        execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_50_nongz_fvec(self):
    h2o.beta_features = True
    avgMichalSize = 237270000
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    importFolderPath = 'airlines'
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
    ]

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        h2o_cmd.runStoreView(timeoutSecs=60)
def test_A_store_view(self):
    # size of H2O store
    store_size = 0

    # import data to have more files in the system
    r = h2i.import_only(bucket='smalldata', path='iris/*')
    store_size += len(r[0]['files'])
    r = h2i.import_only(bucket='smalldata', path='covtype/*')
    store_size += len(r[0]['files'])

    # list all items
    r = h2o.nodes[0].store_view(view=store_size)
    self.assertEqual(store_size, len(r['keys']))

    # list over views including only 3 items
    items_per_page = 3  # items per page
    pages = (store_size / items_per_page)  # number of pages
    if (store_size % items_per_page != 0):
        pages += 1
    offset = 0  # running offset
    cnt_items = 0  # counter of returned items
    for p in range(0, pages):
        r = h2o.nodes[0].store_view(offset=offset, view=items_per_page)
        print h2o.dump_json(r)
        cnt_items += len(r['keys'])
        offset += items_per_page

    self.assertEqual(store_size, cnt_items)
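# The paging loop above can be factored into a small reusable helper. A minimal sketch,
# assuming the same h2o.nodes[n].store_view(offset=..., view=...) call used above, and a
# response whose 'keys' entries each carry a 'key' name (as the StoreView results elsewhere
# in these tests do); collect_all_keys is a hypothetical name, not part of the test harness.
def collect_all_keys(node, store_size, items_per_page=3):
    # walk StoreView in fixed-size pages and gather every key name
    keys = []
    offset = 0
    while offset < store_size:
        r = node.store_view(offset=offset, view=items_per_page)
        keys.extend(k['key'] for k in r['keys'])
        offset += items_per_page
    return keys

# hypothetical usage, mirroring the test above:
# all_keys = collect_all_keys(h2o.nodes[0], store_size)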
def test_parse_multi_exclude_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
    tryList = [
        (300, 100, 'cA', 60, '*x[2-5]*'),
        (310, 200, 'cB', 60, '*x[1,3-5]*'),
        (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
        (330, 400, 'cD', 60, '*x[1-3-5]*'),
        (340, 500, 'cE', 60, '*x[1-4]*'),
    ]

    ## h2b.browseTheCloud()
    cnum = 0

    # create them all first
    for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
        cnum += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        for fileN in range(FILENUM):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList)

    for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
        cnum += 1

        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            print f
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f)

        # pattern match all, then use exclude
        parseResult = h2i.parse_only(pattern="*/syn_*", hex_key=hex_key, exclude=excludePattern,
            header=1, timeoutSecs=timeoutSecs)
        print "parseResult['destination_key']: " + parseResult['destination_key']
        print 'parse time:', parseResult['response']['time']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # FIX! h2o strips one of the headers, but treats all the other files with headers as data
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        print "\n" + parseResult['destination_key'] + ":", \
            " numRows:", "{:,}".format(numRows), \
            " numCols:", "{:,}".format(numCols)

        # all should have rowCount rows (due to the excludePattern)
        self.assertEqual(numRows, rowCount*FILENUM, msg=("got numRows: %s. Should be rowCount: %s * FILENUM: %s" % \
            (numRows, rowCount, FILENUM)))
def test_cols_enum_multi_import(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
    tryList = [
        (300, 100, 'cA', 60, '*x[2-5]*'),
        (310, 200, 'cB', 60, '*x[1,3-5]*'),
        (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
        (330, 400, 'cD', 60, '*x[1-3-5]*'),
        (340, 500, 'cE', 60, '*x[1-4]*'),
    ]

    ## h2b.browseTheCloud()
    cnum = 0

    # create them all first
    for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
        cnum += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        for fileN in range(FILENUM):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList)

    for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
        cnum += 1

        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            print f
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f)

        # pattern match all, then use exclude
        parseResult = h2i.parse_only(pattern="*/syn_*", hex_key=hex_key, exclude=excludePattern,
            header=1, timeoutSecs=timeoutSecs)
        print "parseResult['destination_key']: " + parseResult['destination_key']
        print 'parse time:', parseResult['response']['time']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # FIX! h2o strips one of the headers, but treats all the other files with headers as data
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        print "\n" + parseResult['destination_key'] + ":", \
            " num_rows:", "{:,}".format(num_rows), \
            " num_cols:", "{:,}".format(num_cols)

        # all should have rowCount rows (due to the excludePattern)
        self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \
            (num_rows, rowCount, FILENUM)))
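# Both versions above call write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList),
# which is defined elsewhere in the test files. A minimal sketch with that signature, assuming the
# intent is just a rowCount x colCount CSV whose cells are drawn from translateList; the real helper
# (and its header handling, which the comments above hint at) may well differ.
import random

def write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList):
    # seed so the same SEED reproduces the same file
    r = random.Random(SEED)
    dsf = open(csvPathname, 'w')
    for _ in range(rowCount):
        row = [translateList[r.randint(0, len(translateList) - 1)] for _ in range(colCount)]
        dsf.write(','.join(row) + '\n')
    dsf.close()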
def test_parse_manyfiles_1(self):
    h2o.beta_features = True
    # these will be used as directory imports/parse
    csvDirname = "manyfiles-nflx-gz"
    timeoutSecs = 600
    trial = 0
    for iteration in range(ITERATIONS):
        if DO_UNCOMPRESSED:
            csvFilename = "a_1.dat"
        else:
            csvFilename = "file_1.dat.gz"
        csvPathname = csvDirname + "/" + csvFilename
        trialStart = time.time()

        # import*****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # the import has to overwrite existing keys. no parse
        h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
        elapsed = time.time() - start
        print "import", trial, "end ", 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the import"
        for node in h2o.nodes:
            h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

        # exec does read lock on all existing keys
        if DO_EXEC:
            # fails
            execExpr = "A.hex=c(0,1)"
            # execExpr="A.hex=0;"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
            h2o_cmd.runInspect(key='A.hex')

            print "\nTrying StoreView after the exec "
            h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
            # for node in h2o.nodes:
            #     h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_50_nongz_fvec(self):
    avgMichalSize = 237270000
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx-gz'
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
        ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
    ]

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern
        hex_key = csvFilename + ".hex"

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')

        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=hex_key, timeoutSecs=600)
        execExpr = "A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

        h2o_cmd.runStoreView(timeoutSecs=60)
def test_H_Basic(self):
    # maybe best to extract the key from an import first?
    # this isn't used much, maybe we don't care about this
    h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
    headerKey = h2i.find_key('syn_header.csv')
    # comma 44 is separator
    h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv", header=1,
        header_from_file=headerKey, separator=44)

    # symbolic links work
    # ln -s /home/0xdiag/datasets home-0xdiag-datasets
    # lrwxrwxrwx 1 kevin kevin 21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
    h2i.import_parse(path="standard/covtype.data", bucket="home-0xdiag-datasets")
def test_parse_airline_multi_hdfs_many(self):
    # default
    csvFilename = "hex_10"
    csvFilePattern = '*'  # all files in the folder

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
            use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True

        timeoutSecs = 500
        importFolderPath = "datasets/airlines_multi"
        csvPathname = importFolderPath + "/" + csvFilePattern
        parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

        for trial in range(TRIAL_MAX):
            # each parse now just does one
            csvFilePattern = "*%s.csv" % trial
            # if we want multifile
            # csvFilePattern = "*"

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()

            # print "Don't wait for completion. Just load things up!"
            print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
            print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
            parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_GLM_mnist_s3n_fvec(self):
    csvFilelist = [
        ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600),
        ("mnist_training.csv.gz", "mnist_training.csv.gz", 600),
    ]

    importFolderPath = "mnist"
    csvPathname = importFolderPath + "/*"
    (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='s3n', timeoutSecs=120)

    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        # PARSE test****************************************
        csvPathname = importFolderPath + "/" + testCsvFilename
        testHexKey = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n',
            hex_key=testHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # PARSE train****************************************
        csvPathname = importFolderPath + "/" + trainCsvFilename
        trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n',
            hex_key=trainHexKey, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # GLM****************************************
        y = 0  # first column is pixel value
        print "y:"
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        kwargs = {
            'response': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'gaussian',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 5,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
        }

        timeoutSecs = 1800
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "GLM completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
def test_storeview_import(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    importFolderPath = "standard"
    csvFilelist = [
        ("covtype.data", 300),
    ]

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        csvPathname = importFolderPath + "/" + csvFilename
        trialStart = time.time()

        # PARSE****************************************
        importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
        print h2o.dump_json(importResult)
        storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
        # print h2o.dump_json(storeViewResult)

        hex_key = csvFilename + "_" + str(trial) + ".hex"
        print "parse start on:", csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key=hex_key, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        # STOREVIEW***************************************
        print "Trying StoreView to all nodes after the parse"
        for n, node in enumerate(h2o.nodes):
            print "\n*****************"
            print "StoreView node %s:%s" % (node.http_addr, node.port)
            storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
            f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
            # write the StoreView result out so it can be inspected after the test
            f.write(h2o.dump_json(storeViewResult))
            f.close()
            lastStoreViewResult = storeViewResult

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def uploadit(n, bucket, path, src_key, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    # apparently the putfile has some conflicts. but after the put completes, it's okay
    # to be parallel with the src_key if it has a different name
    (importResult, importPattern) = h2i.import_only(node=h2o.nodes[n], bucket=bucket, path=path, schema='put',
        src_key=src_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
    print "uploadit:", importPattern, hex_key

    # do the parse on the next node
    if UPLOAD_PARSE_DIFF_NODES:
        np1 = (n + 1) % len(h2o.nodes)
    else:
        np1 = n

    if DO_PARSE_ALSO:
        parseit(np1, importPattern, hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
        h2o.nodes[0].rebalance(before=hex_key, after=hex_key + "_2", chunks=32)
    return (importPattern, hex_key)
def uploadit(n, bucket, path, src_key, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    # apparently the putfile has some conflicts. but after the put completes, it's okay
    # to be parallel with the src_key if it has a different name
    (importResult, importPattern) = h2i.import_only(
        node=h2o.nodes[n],
        bucket=bucket,
        path=path,
        schema="put",
        src_key=src_key,
        timeoutSecs=timeoutSecs,
        retryDelaySecs=10,
        pollTimeoutSecs=60,
    )
    print "uploadit:", importPattern, hex_key

    # do the parse on the next node
    if UPLOAD_PARSE_DIFF_NODES:
        np1 = (n + 1) % len(h2o.nodes)
    else:
        np1 = n

    if DO_PARSE_ALSO:
        parseit(
            np1,
            importPattern,
            hex_key,
            timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs,
        )
        h2o.nodes[0].rebalance(source=hex_key, after=hex_key + "_2", chunks=32)
    return (importPattern, hex_key)
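# Both uploadit variants hand the imported pattern off to parseit(), which isn't shown in this
# collection. A minimal sketch of what such a helper could look like, assuming h2i.parse_only
# accepts a node argument the way h2i.import_only does above; the real parseit in the test
# suite may take different parameters.
def parseit(n, pattern, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    # parse the already-imported key on node n into hex_key
    parseResult = h2i.parse_only(node=h2o.nodes[n], pattern=pattern, hex_key=hex_key,
        timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
    print "parseit:", parseResult['destination_key']
    return parseResult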
def test_parse_airline_multi_hdfs(self):
    csvFilename = "hex_10"
    csvFilePattern = '*'  # all files in the folder
    trialMax = 2

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
            disable_assertions=DISABLE_ASSERTIONS,
            use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

        timeoutSecs = 3600
        importFolderPath = "datasets/airlines_multi"

        for trial in range(trialMax):
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            importResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            print "importResult:", h2o.dump_json(importResult)

            parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_parse_all_s3n_thru_hdfs(self):
    print "\nLoad a list of files from s3n, parse it thru HDFS"
    print "In EC2, michal's config always passes the right config xml"
    print "as arg to the java -jar h2o.jar. Only works in EC2"

    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/*'
    importResult = h2i.import_only(bucket=bucket, path=csvPathname, schema='s3n')
    s3nFullList = importResult['succeeded']
    print "s3nFullList:", h2o.dump_json(s3nFullList)
    self.assertGreater(len(s3nFullList), 1, "Didn't see more than 1 files in s3n?")
    s3nList = random.sample(s3nFullList, 8)

    timeoutSecs = 500
    for s in s3nList:
        s3nKey = s['key']
        s3nFilename = s['file']
        # there is some non-file key names returned? s3n metadata?
        # only use the keys with csv in their name
        if ('csv' not in s3nKey) or ('syn_dataset' in s3nKey) or ('.gz' in s3nKey):
            continue

        # creates csvFilename.hex from file in hdfs dir
        print "Loading s3n key: ", s3nKey, 'thru HDFS'
        parseResult = h2i.parse_only(pattern=s3nKey, hex_key=s3nFilename + ".hex",
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

        print "parse result:", parseResult['destination_key']
        start = time.time()
        sys.stdout.flush()
def test_exec2_fast_locks_overlap(self):
    csvPathname = "iris/iris2.csv"
    src_key = "iris.csv"
    if not AVOID_BUG:
        # need the key name (pattern) to feed to parse
        (importResult, importPattern) = h2i.import_only(
            bucket="smalldata", path=csvPathname, schema="put", src_key=src_key, timeoutSecs=10
        )
        # just as a reminder of what these returns look like
        print "importResult:", h2o.dump_json(importResult)
        print "importPattern:", h2o.dump_json(importPattern)
    y = 4

    lastHexKey = None
    for trial in range(1, 100):
        if AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(
                bucket="smalldata", path=csvPathname, schema="put", src_key=src_key, timeoutSecs=10
            )
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)

        # make sure each parse is a unique dest key (not in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(
            pattern=src_key, hex_key=hex_key, noPoll=True, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10
        )

        # wait until iteration 2, when lastHexKey is available, so you can operate on that
        if lastHexKey:
            execExpr = "%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y + 1, lastHexKey, y + 1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

        lastHexKey = hex_key

        # since we are using the same source file, and potentially re-uploading if AVOID_BUG
        # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to
        # use it next iteration
        h2o_jobs.pollWaitJobs(timeoutSecs=10)

        # just show the jobs still going. Shouldn't be any
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_parse_cust(self):
    # run as user 0xcustomer to get access (with .json config and ssh key file specified)
    importFolderPath = '/mnt/0xcustomer-datasets'
    pollTimeoutSecs = 120
    retryDelaySecs = 30
    timeoutSecs = 300

    (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
    importFileList = importResult['files']
    importFailList = importResult['fails']
    importKeyList = importResult['keys']
    importDelList = importResult['dels']

    if len(importDelList) != 0:
        raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))
    if len(importFileList) < MINFILES:
        raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))
    if len(importKeyList) < MINFILES:
        raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))
    if len(importFailList) != 0:
        raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))

    # only parse files with .csv or .tsv in their name (no dirs like that?)
    goodKeyList = [key for key in importKeyList if ('.csv' in key or '.tsv' in key)]

    trial = 0
    # just do 1?
    for i, importKey in enumerate(random.sample(goodKeyList, 3)):
        print "importKey:", importKey
        trial += 1

        start = time.time()
        # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
        # force header=0..should mean headers get treated as NAs
        parseResult = h2i.parse_only(pattern=importKey, header=0,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        origKey = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(key=origKey)
        h2o_cmd.infoFromInspect(inspect, origKey)

        execExpr = 'newKey = ' + origKey + '[1,1]'
        h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
        newParseKey = {'destination_key': 'newKey'}

        h2o_cmd.checkKeyDistribution()
        h2o.nodes[0].remove_key(key=origKey)
        # a key isn't created for a scalar
        # h2o.nodes[0].remove_key(key='newKey')

    self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
def test_parse_summary_manyfiles_s3_fvec(self):
    h2o.beta_features = True
    # these will be used as directory imports/parse
    csvDirlist = [("manyfiles-nflx-gz", 800)]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        # change to 50 files
        csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
        (importHDFSResult, importPattern) = h2i.import_only(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs
        )

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvPathname,
            schema="s3",
            hex_key=hex_key,
            timeoutSecs=timeoutSecs,
            retryDelaySecs=10,
            pollTimeoutSecs=120,
        )
        elapsed = time.time() - start
        print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % (
            (elapsed * 100) / timeoutSecs
        )

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
        print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_nflx_loop_hdfs_fvec(self):
    h2o.beta_features = True
    print "Using the -.gz files from hdfs"
    # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz

    # default
    csvFilename = "hex_10"
    csvFilePattern = '*'  # all files in the folder

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
                base_port=55930, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        else:
            h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
                base_port=55600, use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True

        timeoutSecs = 500
        importFolderPath = "datasets/airlines_multi"
        csvPathname = importFolderPath + "/" + csvFilePattern
        parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

        for trial in range(TRIAL_MAX):
            # each parse now just does one
            csvFilePattern = "*%s.csv" % trial
            # if we want multifile
            # csvFilePattern = "*"

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()

            # print "Don't wait for completion. Just load things up!"
            print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
            print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
            parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_parse_manyfiles_1(self):
    # these will be used as directory imports/parse
    csvDirname = "manyfiles-nflx-gz"
    timeoutSecs = 600
    trial = 0
    for iteration in range(ITERATIONS):
        if DO_UNCOMPRESSED:
            csvFilename = "a_1.dat"
        else:
            csvFilename = "file_1.dat.gz"
        csvPathname = csvDirname + "/" + csvFilename
        trialStart = time.time()

        # import*****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # the import has to overwrite existing keys. no parse
        h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
        elapsed = time.time() - start
        print "import", trial, "end ", 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the import"
        for node in h2o.nodes:
            h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

        # exec does read lock on all existing keys
        if DO_EXEC:
            # fails
            execExpr = "A.hex=c(0,1)"
            # execExpr="A.hex=0;"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
            h2o_cmd.runInspect(key='A.hex')

            print "\nTrying StoreView after the exec "
            h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
            # for node in h2o.nodes:
            #     h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_exec2_fast_locks_overlap(self):
    csvPathname = 'iris/iris2.csv'
    src_key = 'iris.csv'
    if not AVOID_BUG:
        # need the key name (pattern) to feed to parse
        (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
            src_key=src_key, timeoutSecs=10)
        # just as a reminder of what these returns look like
        print "importResult:", h2o.dump_json(importResult)
        print "importPattern:", h2o.dump_json(importPattern)
    y = 4

    lastHexKey = None
    for trial in range(1, 100):
        if AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put',
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)

        # make sure each parse is a unique dest key (not in use)
        hex_key = "iris2_" + str(trial) + ".hex"
        # what if we kicked off another parse without waiting for it? I think the src key gets locked
        # so we'd get lock issues on the src_key
        parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True,
            delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)

        # wait until iteration 2, when lastHexKey is available, so you can operate on that
        if lastHexKey:
            execExpr = "%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

        lastHexKey = hex_key

        # since we are using the same source file, and potentially re-uploading if AVOID_BUG
        # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to
        # use it next iteration
        h2o_jobs.pollWaitJobs(timeoutSecs=10)

        # just show the jobs still going. Shouldn't be any
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
def test_parse_summary_airline_s3(self):
    h2o.beta_features = True
    csvFilelist = [
        ("allyears2k.csv", 300),  # 4.4MB
        ("year1987.csv", 600),  # 130MB
        ("allyears.csv", 900),  # 12GB
        # ("allyears_10.csv", 1800),  # 119.98GB
    ]

    bucket = 'h2o-airlines-unpacked'
    (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
    s3nFullList = importHDFSResult['succeeded']
    self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

    print "\nTrying StoreView after the import s3"
    h2o_cmd.runStoreView(timeoutSecs=120)

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()
        csvPathname = csvFilename

        # PARSE****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # this was schema='local' at one point; now it parses straight from s3
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_with_cancel(self):
    mustWait = 10
    importFolderPath = 'standard'
    timeoutSecs = 500
    csvFilenameList = [
        ("standard", "covtype.data", 54),
        ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        ("standard", "covtype20x.data", 54),
        ("manyfiles-nflx-gz", "file_[100-109].dat.gz", 378),
    ]

    # just loop on the same file. If remnants exist and are locked, we will blow up?
    # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=50)

        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
            timeoutSecs=500, noPoll=True, doSummary=False)
        job_key = parseResult['job_key']

        # give it a little time to start
        time.sleep(3)
        h2o.nodes[0].jobs_cancel(key=job_key)

        # now wait until the job cancels, and we're idle
        h2o_jobs.pollWaitJobs(timeoutSecs=30)
        elapsed = time.time() - start
        print "Cancelled parse completed in", elapsed, "seconds."

        h2o.check_sandbox_for_errors()

        # get a list of keys from storeview. 20 is fine..shouldn't be many, since we putfile, not import folder
        # there may be a lot since we import the whole "standard" folder
        # find the ones that pattern match the csvFilename, and inspect them. Might be none
        storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
        keys = storeViewResult['keys']
        for k in keys:
            keyName = k['key']
            print "kevin:", keyName
            if csvFilename in keyName:
                h2o_cmd.runInspect(key=keyName)
                h2o.check_sandbox_for_errors()

        # This will tell h2o to delete using the key name from the import file, whatever pattern matches to csvFilename
        # we shouldn't have to do this..the import/parse should be able to overwrite without deleting.
        # h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

        # If you cancel a parse, you aren't allowed to reparse the same file or import a directory with that file,
        # or cause the key name that the parse would have used, for 5 seconds after the cancel request gets a json
        # response
        print "Waiting", mustWait, "seconds before next reparse-cancel."
        time.sleep(mustWait)
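# The cancel sequence above (start the parse with noPoll, sleep briefly, cancel the job, then
# poll until idle) also appears in the shorter variant later in this collection. A minimal sketch
# of it factored into a helper, using only calls already shown in these tests; cancel_parse is a
# hypothetical name and the delays are the same ones hard-coded above.
def cancel_parse(bucket, csvPathname, hex_key, startDelaySecs=3, waitSecs=30):
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key,
        timeoutSecs=500, noPoll=True, doSummary=False)
    job_key = parseResult['job_key']
    # give the parse a little time to start before cancelling it
    time.sleep(startDelaySecs)
    h2o.nodes[0].jobs_cancel(key=job_key)
    # wait until the job cancels and the cluster is idle again
    h2o_jobs.pollWaitJobs(timeoutSecs=waitSecs)
    return job_key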
def test_parse_summary_airline_s3(self):
    csvFilelist = [
        ("allyears2k.csv", 300),  # 4.4MB
        ("year1987.csv", 600),  # 130MB
        ("allyears.csv", 900),  # 12GB
        # ("allyears_10.csv", 1800),  # 119.98GB
    ]

    bucket = 'h2o-airlines-unpacked'
    (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
    s3nFullList = importHDFSResult['succeeded']
    self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

    print "\nTrying StoreView after the import s3"
    h2o_cmd.runStoreView(timeoutSecs=120)

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()
        csvPathname = csvFilename

        # PARSE****************************************
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        # this was schema='local' at one point; now it parses straight from s3
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_summary_zip_s3_fvec(self):
    h2o.beta_features = True
    csvFilelist = [
        ("test_set.zip", 300),  # 110.9MB
        ("train_set.zip", 600),  # 362.9MB
    ]

    (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3')

    print "\nTrying StoreView after the import hdfs"
    h2o_cmd.runStoreView(timeoutSecs=120)

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()
        csvPathname = csvFilename

        # PARSE****************************************
        csvPathname = "allstate/" + csvFilename
        hex_key = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_summary_manyfiles_1_fvec(self):
    h2o.beta_features = True
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles-nflx-gz", 600),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        csvPathname = csvDirname + "/file_1.dat.gz"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='local', timeoutSecs=timeoutSecs)

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        self.assertEqual(numCols, 542)
        self.assertEqual(numRows, 100000)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        # pass numRows, so we know when na cnt means row is all na's
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, numCols=numCols, numRows=numRows)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_50_nongz_fvec(self):
    h2o.beta_features = True
    avgMichalSize = 237270000
    bucket = 'home-0xdiag-datasets'
    importFolderPath = 'manyfiles-nflx'
    importFolderPath = 'airlines'
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
    ]

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        h2o_cmd.runStoreView(timeoutSecs=60)
def test_parse_summary_manyfiles_s3n(self):
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles", 600),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname,
            schema='s3n', timeoutSecs=timeoutSecs)
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n',
            hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_parse_summary_manyfiles_s3n(self):
    # these will be used as directory imports/parse
    csvDirlist = [
        ("manyfiles-nflx-gz", 600),
    ]
    trial = 0
    for (csvDirname, timeoutSecs) in csvDirlist:
        csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname,
            schema='s3n', timeoutSecs=timeoutSecs)
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 files in s3n?")

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trialStart = time.time()
        # PARSE****************************************
        hex_key = csvDirname + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n',
            hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
        elapsed = time.time() - start
        print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # INSPECT******************************************
        # We should be able to see the parse result?
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # gives us some reporting on missing values, constant values, to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

        # SUMMARY****************************************
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult)

        # STOREVIEW***************************************
        print "\nTrying StoreView after the parse"
        h2o_cmd.runStoreView(timeoutSecs=120)

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_50_nongz_fvec(self):
    avgMichalSize = 237270000 * 2
    bucket = 'home-0xdiag-datasets'
    importFolderPath = "many_many"
    print "Using non-gz'ed files in", importFolderPath
    csvFilenameList = [
        ("*.dat", "file_18_A.dat", 18 * avgMichalSize, 1800),
    ]

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "Parse result['destination_key']:", parseResult['destination_key']

        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        if totalBytes is not None:
            fileMBS = (totalBytes / 1e6) / elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
def test_parse_airline_multi_hdfs(self):
    h2o.beta_features = True
    csvFilename = "hex_10"
    csvFilePattern = '*'  # all files in the folder
    trialMax = 2

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
                base_port=55930, disable_assertions=DISABLE_ASSERTIONS,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
        else:
            # why is 55609 already in use??
            h2o_hosts.build_cloud_with_hosts(sandbox_ignore_errors=True, force_tcp=True,
                java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
                base_port=55604, disable_assertions=DISABLE_ASSERTIONS,
                use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True

        timeoutSecs = 3600
        importFolderPath = "datasets/airlines_multi"

        for trial in range(trialMax):
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            importResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            print "importResult:", h2o.dump_json(importResult)

            parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_parse_airline_multi_hdfs(self):
    csvFilename = "hex_10"
    csvFilePattern = '*'  # all files in the folder
    trialMax = 2

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
        h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP,
            disable_assertions=DISABLE_ASSERTIONS,
            use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

        timeoutSecs = 3600
        importFolderPath = "datasets/airlines_multi"

        for trial in range(trialMax):
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            importResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            print "importResult:", h2o.dump_json(importResult)

            parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start

            print "parse result:", parseResult['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_cmd.runStoreView()
            # we don't delete the hex key. it will start spilling? slow

        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
def test_parse_with_cancel(self):
    timeoutSecs = 500
    csvFilenameList = [
        ("standard", "covtype.data", 54),
        ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
        ("standard", "covtype20x.data", 54),
        ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
    ]
    # just loop on the same file. If remnants exist and are locked, we will blow up?
    # Maybe try an inspect to see if either the source key or parse key exists and causes stack traces
    for (importFolderPath, csvFilename, response) in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
            timeoutSecs=500, noPoll=True, doSummary=False)
        job_key = parseResult['job_key']
        # give it a little time to start
        time.sleep(3)
        h2o.nodes[0].jobs_cancel(key=job_key)
        # now wait until the job cancels, and we're idle
        h2o_jobs.pollWaitJobs(timeoutSecs=30)
        elapsed = time.time() - start
        print "Cancelled parse completed in", elapsed, "seconds."

        h2o.check_sandbox_for_errors()
        # get a list of keys from StoreView. There may be a lot, since we import whole folders.
        # Find the ones that pattern-match csvFilename and inspect them. Might be none.
        storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
        keys = storeViewResult['keys']
        for k in keys:
            keyName = k['key']
            print "storeview key:", keyName
            if csvFilename in keyName:
                h2o_cmd.runInspect(key=keyName)
                h2o.check_sandbox_for_errors()
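# The cancel test above boils down to a small pattern: start the parse with
# noPoll=True so the call returns a job key immediately, cancel that job, then
# block on pollWaitJobs() until the cloud is idle before touching any keys.
# A minimal sketch of just that pattern, assuming the usual top-of-file imports
# (time, h2o, h2i, h2o_jobs) and an already-imported source file.
def start_and_cancel_parse_sketch(csvPathname, hex_key, bucket='home-0xdiag-datasets'):
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key,
        timeoutSecs=500, noPoll=True, doSummary=False)
    job_key = parseResult['job_key']
    time.sleep(3)                          # give the job a moment to actually start
    h2o.nodes[0].jobs_cancel(key=job_key)
    h2o_jobs.pollWaitJobs(timeoutSecs=30)  # wait until the cancel has drained
    h2o.check_sandbox_for_errors()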
def test_parse_summary_zip_s3_fvec(self): h2o.beta_features = True csvFilelist = [ ("test_set.zip", 300), # 110.9MB ("train_set.zip", 600), # 362.9MB ] (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3') print "\nTrying StoreView after the import hdfs" h2o_cmd.runStoreView(timeoutSecs=120) trial = 0 for (csvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() csvPathname = csvFilename # PARSE**************************************** csvPathname = "allstate/" + csvFilename hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120) elapsed = time.time() - start print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # INSPECT****************************************** # We should be able to see the parse result? start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360) h2o_cmd.infoFromSummary(summaryResult) # STOREVIEW*************************************** print "\nTrying StoreView after the parse" h2o_cmd.runStoreView(timeoutSecs=120) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds." trial += 1
def test_parse_all_s3n_thru_hdfs(self):
    print "\nLoad a list of files from s3n, parse it thru HDFS"
    print "In EC2, michal's config always passes the right config xml"
    print "as arg to the java -jar h2o.jar. Only works in EC2"

    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/*'
    # import_only returns (importResult, importPattern); the 'succeeded' list is on the result dict
    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='s3n')
    s3nFullList = importResult['succeeded']
    print "s3nFullList:", h2o.dump_json(s3nFullList)
    self.assertGreater(len(s3nFullList), 1, "Didn't see more than 1 file in s3n?")

    s3nList = random.sample(s3nFullList, 8)
    timeoutSecs = 500
    for s in s3nList:
        s3nKey = s['key']
        s3nFilename = s['file']
        # there are some non-file key names returned? s3n metadata?
        # only use the keys with csv in their name
        if ('csv' not in s3nKey) or ('syn_dataset' in s3nKey) or ('.gz' in s3nKey):
            continue
        # creates csvFilename.hex from file in hdfs dir
        print "Loading s3n key: ", s3nKey, 'thru HDFS'
        parseResult = h2i.parse_only(pattern=s3nKey, hex_key=s3nFilename + ".hex",
            timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

        print s3nFilename, 'parse time:', parseResult['response']['time']
        print "parse result:", parseResult['destination_key']
        start = time.time()
        sys.stdout.flush()
def test_50_nongz_fvec(self): h2o.beta_features = True avgMichalSize = 237270000 bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using non-gz'ed files in", importFolderPath csvFilenameList= [ ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800), ] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg
def parseFile(self, importFolderPath='datasets', csvFilename='airlines_all.csv', timeoutSecs=500, **kwargs): csvPathname = importFolderPath + "/" + csvFilename start = time.time() # do an import first, because we want to get the size of the file (importResult, importPattern) = h2i.import_only(path=csvPathname, schema="hdfs", timeoutSecs=timeoutSecs) succeeded = importResult['succeeded'] if len(succeeded) < 1: raise Exception("Should have imported at least 1 key for %s" % csvPathname) # just do a search foundIt = None for f in succeeded: if csvPathname in f['key']: foundIt = f print "foundit f:", f break if not foundIt: raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname)) parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs) elapsed = time.time() - start print "Parse of", parseResult['destination_key'], "took", elapsed, "seconds" parseResult['python_call_timer'] = elapsed print "Parse result['destination_key']:", parseResult['destination_key'] start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=200) elapsed = time.time() - start print "Inspect:", parseResult['destination_key'], "took", elapsed, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] print "num_rows:", num_rows, "num_cols", num_cols return parseResult
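# How a benchmark loop might call the parseFile() helper above: parse the file,
# then read back the fields the helper actually sets ('python_call_timer' and
# 'destination_key'). The surrounding prints are just an illustrative sketch of
# a caller, assuming this method lives on the same test class as parseFile().
def parse_and_report_sketch(self):
    parseResult = self.parseFile(importFolderPath='datasets', csvFilename='airlines_all.csv', timeoutSecs=500)
    print "wall-clock parse time recorded by the helper:", parseResult['python_call_timer']
    print "frame key:", parseResult['destination_key']
    return parseResult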
def test_rf_mnist_both_fvec(self):
    h2o.beta_features = True
    importFolderPath = "mnist"
    csvFilelist = [
        # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
        ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
        # to see results a 2nd time
        ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
    ]
    # IMPORT**********************************************
    # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
    (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*")
    ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
    if 'files' in importFolderResult:
        succeededList = importFolderResult['files']
    else:
        succeededList = importFolderResult['succeeded']
    ### print "succeededList:", h2o.dump_json(succeededList)

    self.assertGreater(len(succeededList), 1, "Should see more than 1 file in the import?")
    # why does this hang? can't look at storeview after import?
    print "\nTrying StoreView after the import folder"
    h2o_cmd.runStoreView(timeoutSecs=30)

    trial = 0
    allDelta = []
    for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename,
            hex_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + parsePattern,
            hex_key=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # RF+RFView (train)****************************************
        print "Not using ignore from this..have to adjust cols?"
        h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
        ntree = 2
        params = {
            'response': 'C1',
            # 'ignored_cols_by_name': ignore_x,
            'ntrees': ntree,
            'mtries': 28,  # fix because we ignore some cols, which will change the sqrt(cols) calc?
            'max_depth': 20,
            'sample_rate': 0.67,
            'destination_key': 'RF_model',
            'nbins': 100,
            'importance': 0,
            'balance_classes': 0,
        }
        if rfSeed is None:
            params['seed'] = random.randint(0, sys.maxint)
        else:
            params['seed'] = rfSeed
        print "RF seed:", params['seed']

        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # print 'rfView:', h2o.dump_json(rfView)
        h2o_rf.simpleCheckRFView(None, rfView, **params)
        modelKey = rfView['drf_model']['_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response': y}
        rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
        # training and test data are unique, so error won't be low?
        # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003,
        #     msg="Classification error %s differs too much" % classification_error)

        leaves = {
            'min': rfView['drf_model']['treeStats']['minLeaves'],
            'mean': rfView['drf_model']['treeStats']['meanLeaves'],
            'max': rfView['drf_model']['treeStats']['maxLeaves'],
        }
        # Expected values are from this case:
        # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
        leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701}
        for l in leaves:
            # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10,
            #     msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
            delta = 100.0 * (leaves[l] - leavesExpected[l]) / leaves[l]
            d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta)
            print d
            allDelta.append(d)

        depth = {
            'min': rfView['drf_model']['treeStats']['minDepth'],
            'mean': rfView['drf_model']['treeStats']['meanDepth'],
            'max': rfView['drf_model']['treeStats']['maxDepth'],
        }
        depthExpected = {'min': 20, 'mean': 20, 'max': 20}
        for l in depth:
            # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1,
            #     msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
            # divide by the depth stat (not leaves) to get the depth pct. difference
            delta = 100.0 * (depth[l] - depthExpected[l]) / depth[l]
            d = "seed: %s depth %s %s %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta)
            print d
            allDelta.append(d)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

    # Done *******************************************************
    print "\nShowing the results again from all the trials, to see variance"
    for d in allDelta:
        print d
def test_GBM_with_cancels(self): print "Sets h2o.beta_features like -bf at command line" print "this will redirect import and parse to the 2 variants" h2o.beta_features = True importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), # ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename ### h2o.beta_features = False (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # hack it because no response from Parse2 if h2o.beta_features: parseResult = {'destination_key': 'c.hex'} print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult['destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified if importFolderPath=='manyfiles-nflx-gz': if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1) kwargs = { 'str': execExpr } resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) x = range(542) # remove the output too! (378) xIgnore = [] # BUG if you add unsorted 378 to end. remove for now for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]: if i not in x: print "x:", x print 'missing?', i x.remove(i) xIgnore.append(i) x = ",".join(map(str,x)) def colIt(x): return "C" + str(x) xIgnore = ",".join(map(colIt, xIgnore)) else: # leave one col ignored, just to see? 
xIgnore = 0 modelKey = "GBMGood" params = { 'destination_key': modelKey, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult) # no pattern waits for all for i in range(20): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): kwargs['destination_key'] = 'GBMBad' + str(j) GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) jobids.append(GBMFirstResult['job_key']) # have to pass the job id for j in jobids: h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) elapsed = time.time() - start print "GBM training completed in", elapsed, "seconds." gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey) # errrs from end of list? is that the last tree? errsLast = gbmTrainView['gbm_model']['errs'][-1] print "GBM 'errsLast'", errsLast if DO_CLASSIFICATION: cm = gbmTrainView['gbm_model']['cm'] pctWrongTrain = h2o_gbm.pp_cm_summary(cm); print "Last line of this cm might be NAs, not CM" print "\nTrain\n==========\n" print h2o_gbm.pp_cm(cm) else: print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs']) h2o.check_sandbox_for_errors() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
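# The GBM-with-cancels test above exercises one pattern worth isolating: launch
# several GBM builds with noPoll=True, cancel the throwaway ones by job key, and
# only poll for the model you actually want. A sketch of that pattern, assuming
# the usual harness imports and a parseResult/params dict like the ones above.
def gbm_with_background_cancels_sketch(parseResult, params):
    kwargs = params.copy()
    kwargs['destination_key'] = 'GBMGood'
    h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)

    jobids = []
    for j in range(5):
        kwargs['destination_key'] = 'GBMBad' + str(j)
        badResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
        jobids.append(badResult['job_key'])
    for j in jobids:
        h2o.nodes[0].jobs_cancel(key=j)

    # wait only for the model we kept
    h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    return h2o_cmd.runGBMView(model_key='GBMGood')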
def test_parse_multi_header_single_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output" # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ (57, 300, 9, 'cA', 60, 0), # try with 1-3 data lines in the header file too (57, 300, 9, 'cB', 60, 1), (57, 300, 9, 'cC', 60, 2), (57, 300, 9, 'cD', 60, 3), ] trial = 0 for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList: trial += 1 # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 for fileN in range(fileNum): csvFilename = 'syn_' + str(fileN) + "_" + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount) dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList) totalDataRows += dataRowsDone # create the header file # can make it pass by not doing this if HEADER: csvFilename = 'syn_header_' + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList) totalDataRows += dataRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = "syn_" + str(trial) hex_key = "syn_" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) print f if HEADER: header = h2i.find_key('syn_header') if not header: raise Exception( "Didn't find syn_header* key in the import") # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key = " + header start = time.time() parseResult = h2i.parse_only(pattern='*' + rowxcol + '*', hex_key=hex_key, timeoutSecs=timeoutSecs, header="1", header_from_file=header) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # should match # of cols in header or ?? 
self.assertEqual( inspect['numCols'], totalCols, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) self.assertEqual(inspect['numRows'], totalDataRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalDataRows)) # put in an ignore param, that will fail unless headers were parsed correctly if HEADER: kwargs = { 'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE' } else: kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1} start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100) print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \ 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors()
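# The multi-file header test above uses a three-step recipe: put the data files
# and the header file with schema='put', locate the header key with find_key(),
# then hand that key to parse_only() via header_from_file. A compact sketch of
# just the recipe, assuming files already written locally and the usual imports.
import os

def parse_with_separate_header_sketch(dataDir, dataPattern, headerFilename, hex_key, timeoutSecs=60):
    for f in os.listdir(dataDir):
        h2i.import_only(path=dataDir + "/" + f, schema='put', noPrint=True)
    headerKey = h2i.find_key(headerFilename)
    if not headerKey:
        raise Exception("Didn't find %s key in the import" % headerFilename)
    return h2i.parse_only(pattern=dataPattern, hex_key=hex_key,
        timeoutSecs=timeoutSecs, header="1", header_from_file=headerKey)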
def test_parse_nflx_loop_s3n_hdfs(self): DO_GLM = True DO_GLMGRID = False USE_S3 = False noPoll = False benchmarkLogging = ['jstack','iostats'] benchmarkLogging = ['iostats'] benchmarkLogging = [] # typical size of the michal files avgMichalSize = 116561140 avgSynSize = 4020000 synSize = 183 csvFilenameList = [ (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300), (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700), (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900), (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600), (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600), # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use (["A-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200), (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200), (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"], "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200), ] print "Using the -.gz files from s3" # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz # split out the pattern match and the filename used for the hex trialMax = 1 pollTimeoutSecs = 180 retryDelaySecs = 10 # use i to forward reference in the list, so we can do multiple outstanding parses below for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): bucket = "home-0xdiag-datasets" ## for tryHeap in [54, 28]: h2oPerNode = 1 # h1.4xlarge 60.5GB dram for tryHeap in [28]: if USE_S3: protocol = "s3" else: protocol = "s3n" print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse" # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" # jea = "-Dh2o.find-ByteBuffer-leaks=true" h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10) # java_extra_args=jea, # don't raise exception if we find something bad in h2o stdout/stderr? 
h2o.nodes[0].sandboxIgnoreErrors = True for trial in range(trialMax): # import a list of folders, one at a time (hdfs import can't take pattern match # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket # too slow for csvFolder in csvFolderList: # since we delete the key, we have to re-import every iteration, to get it again # s3n URI thru HDFS is not typical. if USE_S3: (importResult, importPattern) = h2i.import_only( bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3') else: (importResult, importPattern) = h2i.import_only( bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs') foundKeys = 0 for s in importResult['succeeded']: # just print the first tile # if 'nflx' in key and 'file_1.dat.gz' in key: if csvFilepattern in s['key']: # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz print "example file we'll use:", s['key'] break else: pass foundKeys += 1 ### print "s3nFullList:", h2o.dump_json(s3nFullList) # error if none? self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?") src_key = csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if noPoll: if (i+1) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1] src_key = csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) if (i+2) < len(csvFilenameList): time.sleep(1) h2o.check_sandbox_for_errors() (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2] src_key = URI + csvFilepattern hex_key = csvFilename + "_" + str(trial) + ".hex" print "Loading", protocol, "key:", src_key, "to", hex_key parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + csvFilepattern, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "parse result:", parseResult['destination_key'] print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print stats on all three if noPoll if noPoll: # does it take a little while to show up in Jobs, from where we issued the parse? time.sleep(2) # FIX! use the last (biggest?) timeoutSecs? 
maybe should increase since parallel h2o_jobs.pollWaitJobs(pattern=csvFilename, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging) # for getting the MB/sec closer to 'right' totalBytes += totalBytes2 + totalBytes3 elapsed = time.time() - start h2o.check_sandbox_for_errors() if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format( len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed) print l h2o.cloudPerfH2O.message(l) y = 378 if not noPoll: x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) #********************************************************************************** # Do GLM too # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive) if DO_GLM or DO_GLMGRID: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! (378) for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]: x.remove(i) x = ",".join(map(str,x)) if DO_GLM: algo = 'GLM' GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5} start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) else: algo = 'GLMGrid' GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial', 'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4, 'lambda': '1e-4', 'alpha': '0,0.5', 'thresholds': '0.5' } start = time.time() glm = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, **GLMkwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs) h2o.check_sandbox_for_errors() l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) #********************************************************************************** print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." ### storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz" # have to do the pattern match ourself, to figure out what keys to delete # we're deleting the keys in the initial import. We leave the keys we created # by the parse. We use unique dest keys for those, so no worries. # Leaving them is good because things fill up! (spill) h2o_cmd.checkKeyDistribution() h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult) h2o.tear_down_cloud() # sticky ports? wait a bit. print "Waiting 30 secs before building cloud again (sticky ports?)" time.sleep(30)
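# The throughput bookkeeping above repeats in several benchmarks: time the parse,
# convert bytes to MB/sec, and push one formatted line into the benchmark log.
# A sketch of that bookkeeping, assuming the usual imports and a cloud built with
# enable_benchmark_log=True so h2o.cloudPerfH2O exists.
def log_parse_throughput_sketch(csvFilename, totalBytes, elapsed, tryHeap):
    if totalBytes is None:
        return
    fileMBS = (totalBytes / 1e6) / elapsed
    l = '{:d} jvms, {:d}GB heap, {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
        len(h2o.nodes), tryHeap, csvFilename, fileMBS, elapsed)
    print l
    h2o.cloudPerfH2O.message(l)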
def sub_c3_fvec_long(self): h2o.beta_features = True # a kludge h2o.setup_benchmark_log() avgMichalSize = 116561140 bucket = "home-0xdiag-datasets" ### importFolderPath = 'more1_1200_link' importFolderPath = "manyfiles-nflx-gz" print "Using .gz'ed files in", importFolderPath if len(h2o.nodes) == 1: csvFilenameList = [("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600)] else: csvFilenameList = [ ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800), # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 1800), ] if LOG_MACHINE_STATS: benchmarkLogging = ["cpu", "disk", "network"] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local") importFullList = importResult["files"] importFailList = importResult["fails"] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse( bucket=bucket, path=csvPathname, schema="local", hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging, ) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "Parse result['destination_key']:", parseResult["destination_key"] h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes / 1e6) / elapsed msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed ) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # these are all the columns that are enums in the dataset...too many for GLM! x = range(542) # don't include the output column # remove the output too! 
(378) ignore_x = [] for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]: x.remove(i) ignore_x.append(i) x.remove(378) # add one since we are no longer 0 based offset x = ",".join(map(lambda x: "C" + str(x + 1), x)) ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x)) GLMkwargs = { "ignored_cols": ignore_x, "response": "C379", "max_iter": 4, "n_folds": 1, "family": "binomial", "alpha": 0.2, "lambda": 1e-5, } # convert to binomial execExpr = "A.hex=%s" % parseResult["destination_key"] h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ("C379", "C379", 15) h2e.exec_expr(execExpr=execExpr, timeoutSecs=60) aHack = {"destination_key": "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed ) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
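# The GLM setup above depends on one exec trick: copy the parsed frame to A.hex
# and overwrite the response column with a boolean cut so a 'binomial' GLM has a
# 0/1 target. The same two expressions, isolated, assuming the usual imports and
# a parseResult from one of the nflx parses above (C379 is the 1-based response).
def binarize_response_sketch(parseResult, responseCol='C379', threshold=15):
    execExpr = "A.hex=%s" % parseResult["destination_key"]
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
    execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (responseCol, responseCol, threshold)
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
    # hand this dict to runGLM() in place of a parseResult, as the tests above do
    return {"destination_key": "A.hex"}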
def test_exec_enums_rand_cut2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1, MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1 == 1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0, 1) == 0: cutExprList.append('p$C' + str(i + 1) + '!=' + c) else: cutExprList.append('p$C' + str(i + 1) + '==' + c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J' + src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*' + src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) 
levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception( "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1 == 1: a = 'a=c(1,2,3);' + ';'.join( ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C' + str(iColCount + 1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print( hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
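# The quantile timing loop above reduces to a single node call. A sketch of just
# that call, assuming the usual imports plus the DO_MEDIAN / MAX_QBINS /
# MULTI_PASS constants defined at the top of this test file.
def median_or_p999_sketch(source_key, column):
    quantile = 0.5 if DO_MEDIAN else 0.999
    q = h2o.nodes[0].quantiles(source_key=source_key, column=column,
        quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
    return q['result']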
        csvWrt.writerow(row)
    finally:
        output.close()

if __name__ == '__main__':
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)
    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=False)

    # AIRLINES
    airlinesTestParseStart = time.time()
    hK = "AirlinesHeader.csv"
    headerPathname = "bench/Airlines" + "/" + hK
    h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
    headerKey = h2i.find_key(hK)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path='bench/Airlines/AirlinesTest.csv',
        schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44,
        timeoutSecs=4800, retryDelaySecs=5, pollTimeoutSecs=4800)
    elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
    row = {'testParseWallTime': elapsedAirlinesTestParse}
    response = 'IsDepDelayed'
    ignored = None
def sub_c3_nongz_fvec_long(self, csvFilenameList): # a kludge h2o.setup_benchmark_log() bucket = 'home-0xdiag-datasets' importFolderPath = 'manyfiles-nflx' print "Using nongz'ed files in", importFolderPath if LOG_MACHINE_STATS: benchmarkLogging = ['cpu', 'disk', 'network'] else: benchmarkLogging = [] pollTimeoutSecs = 120 retryDelaySecs = 10 for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList): csvPathname = importFolderPath + "/" + csvFilepattern if DO_DOUBLE_IMPORT: (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local') importFullList = importResult['files'] importFailList = importResult['fails'] print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList) # this accumulates performance stats into a benchmark log over multiple runs # good for tracking whether we're getting slower or faster h2o.cloudPerfH2O.change_logfile(csvFilename) h2o.cloudPerfH2O.message("") h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------") start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key="A.hex", timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs, benchmarkLogging=benchmarkLogging) elapsed = time.time() - start print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Parse result['destination_key']:", parseResult['destination_key'] h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) if totalBytes is not None: fileMBS = (totalBytes/1e6)/elapsed msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed) print msg h2o.cloudPerfH2O.message(msg) if DO_GLM: # output 378 can't be in this ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541] ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x)) GLMkwargs = { 'ignored_cols': ignore_x, 'response': 'C379', 'max_iter': 10, 'n_folds': 1, 'family': 'binomial', 'alpha': 0.2, 'lambda': 1e-5 } # convert to binomial # execExpr="A.hex=%s" % parseResult['destination_key'] # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) # are the unparsed keys slowing down exec? h2i.delete_keys_at_all_nodes(pattern="manyfile") execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)' h2e.exec_expr(execExpr=execExpr, timeoutSecs=180) aHack = {'destination_key': "A.hex"} start = time.time() glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs) elapsed = time.time() - start h2o.check_sandbox_for_errors() h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs) msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed) print msg h2o.cloudPerfH2O.message(msg) h2o_cmd.checkKeyDistribution()
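# Both nflx GLM variants above build 'ignored_cols' the same way: take 0-based
# column indices and turn them into the 1-based "C<n>" names the fvec GLM expects.
# The mapping isolated as a tiny helper, so the off-by-one is easy to see; the
# indices passed in would be the same enum columns the tests above skip.
def ignored_col_names_sketch(zero_based_indices):
    # 0-based index i becomes column name "C<i+1>"
    return ",".join(["C" + str(i + 1) for i in zero_based_indices])

# e.g. ignored_col_names_sketch([3, 4, 5]) -> "C4,C5,C6"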
def test_RF_mnist_both(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ntree = 25 params = { 'response': 0, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'max_depth': 2147483647, 'select_stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample_rate': 0.67, 'oobee': 1, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'destination_key': 'RF_model', 'nbins': 1024, # 'seed': 784834182943470027, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # RFView (score on test)**************************************** (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # was 2.84 # sometimes get 2.87? self.assertAlmostEqual( classification_error, 1.6, delta=1.6, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['speedrf_model']['treeStats'] leaves = { 'min': treeStats['minLeaves'], 'mean': treeStats['meanLeaves'], 'max': treeStats['maxLeaves'] } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': treeStats['minDepth'], 'mean': treeStats['meanDepth'], 'max': treeStats['maxDepth'] } depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() modelKey = rfView['speedrf_model']['_key'] predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_storeview_import(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    importFolderPath = "standard"
    csvFilelist = [
        ("covtype.data", 300),
    ]

    trial = 0
    for (csvFilename, timeoutSecs) in csvFilelist:
        csvPathname = importFolderPath + "/" + csvFilename
        trialStart = time.time()

        # PARSE****************************************
        importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
        print h2o.dump_json(importResult)
        storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
        # print h2o.dump_json(storeViewResult)

        hex_key = csvFilename + "_" + str(trial) + ".hex"
        print "parse start on:", csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
            hex_key=hex_key, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # SUMMARY****************************************
        # gives us some reporting on missing values, constant values,
        # to see if we have x specified well
        # figures out everything from parseResult['destination_key']
        # needs y to avoid output column (which can be index or name)
        # assume all the configs have the same y..just check with the first one
        goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult['destination_key'], timeoutSecs=300)
        summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

        # STOREVIEW***************************************
        print "Trying StoreView on all nodes after the parse"
        for n, node in enumerate(h2o.nodes):
            print "\n*****************"
            print "StoreView node %s:%s" % (node.http_addr, node.port)
            storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
            f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
            result = h2o.dump_json(storeViewResult)
            # actually write the dump to the file before closing it
            f.write(result)
            f.close()
            lastStoreViewResult = storeViewResult

        print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
        trial += 1
def test_GBM_cancel_model_reuse(self): h2o.beta_features = True importFolderPath = 'standard' timeoutSecs = 500 csvFilenameAll = [ # have to use col name for response? ("manyfiles-nflx-gz", "file_1.dat.gz", 378), # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378), # ("standard", "covtype.data", 54), # ("standard", "covtype20x.data", 54), ] # csvFilenameList = random.sample(csvFilenameAll,1) csvFilenameList = csvFilenameAll # pop open a browser on the cloud # h2b.browseTheCloud() for (importFolderPath, csvFilename, response) in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?" (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json h2o.check_sandbox_for_errors() # wait for it to show up in jobs? ## time.sleep(2) # no pattern waits for all ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # print "\nparseResult", h2o.dump_json(parseResult) print "Parse result['destination_key']:", parseResult['destination_key'] ## What's wrong here? too big? ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True) h2o.check_sandbox_for_errors() # have to avoid this on nflx data. colswap with exec # Exception: rjson error in gbm: Argument 'response' error: # Only integer or enum/factor columns can be classified if DO_CLASSIFICATION: # need to flip the right col! (R wise) execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1) kwargs = { 'str': execExpr } resultExec = h2o_cmd.runExec(**kwargs) # lets look at the response column now s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1) # x = range(542) # remove the output too! (378) ignoreIndex = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response] # have to add 1 for col start with 1, now. plus the C xIgnore = ",".join(["C" + str(i+1) for i in ignoreIndex]) params = { 'destination_key': None, 'ignored_cols_by_name': xIgnore, 'learn_rate': .1, 'ntrees': 2, 'max_depth': 8, 'min_rows': 1, 'response': "C" + str(response+1), 'classification': 1 if DO_CLASSIFICATION else 0, 'grid_parallelism': 4, } kwargs = params.copy() timeoutSecs = 1800 for i in range(5): # now issue a couple background GBM jobs that we'll kill jobids = [] for j in range(5): # FIX! apparently we can't reuse a model key after a cancel kwargs['destination_key'] = 'GBMBad' + str(j) # rjson error in poll_url: Job was cancelled by user! GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs) jobids.append(GBMFirstResult['job_key']) h2o.check_sandbox_for_errors() # have to pass the job id # for j in jobids: # h2o.nodes[0].jobs_cancel(key=j) h2o_jobs.cancelAllJobs() # PUB-361. going to wait after cancel before reusing keys time.sleep(3) # am I getting a subsequent parse job cancelled? h2o_jobs.showAllJobs() if DELETE_KEYS: h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug:
            pre = 'DEBUG'
        gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv, 'w')
            output.write(','.join(csv_header) + '\n')
        else:
            output = open(gbmbenchcsv, 'a')
        csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                                dialect='excel', extrasaction='ignore', delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + folderPath
            if (f in ['AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x',
                      'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']):
                csvPathname = importFolderPath + "/" + f + '.csv'
            else:
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           header=1,
                                           header_from_file=headerKey,
                                           separator=44,
                                           timeoutSecs=7200,
                                           retryDelaySecs=5,
                                           pollTimeoutSecs=7200)
            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime, " seconds."
            inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'])
            inspect_test = h2o.nodes[0].inspect(testFilehex)
            # use == for the integer comparison; 'is' only happens to work for small ints
            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
            row.update({'h2o_build': build,
                        'nMachines': nMachines,
                        'nJVMs': len(h2o.nodes),
                        'Xmx/JVM': java_heap_GB,
                        'dataset': f,
                        'nTrainRows': inspect_train['numRows'],
                        'nTestRows': inspect_test['numRows'],
                        'nCols': inspect_train['numCols'],
                        'trainParseWallTime': parseWallTime,
                        'classification': classification,
                        })
            params = {'destination_key': 'GBM(' + f + ')',
                      'response': response,
                      'ignored_cols_by_name': ignored_cols,
                      'classification': classification,
                      'validation': testFilehex,
                      'ntrees': ntrees,
                      'max_depth': depth,
                      'min_rows': minrows,
                      'nbins': nbins,
                      'learn_rate': learnRate,
                      }
            kwargs = params.copy()
            gbmStart = time.time()
            # TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime = time.time() - gbmStart
            row.update({'gbmBuildTime': gbmTime,
                        })
            # TODO(spencer): Add in gbm scoring
            # gbmScoreStart = time.time()
            # gbmScore = h2o_cmd.runGLMScore(key=testFilehex, model_key=params['destination_key'])
            # scoreTime = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
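# A minimal sketch (not part of the original benchmark code) of the csv.DictWriter bookkeeping used
# in doGBM above: append one row per run to a shared benchmark csv, writing the header line only
# when the file is first created. csv_header and the row contents passed in are illustrative.
import csv
import os

def append_benchmark_row(benchCsvPath, csv_header, row):
    newFile = not os.path.exists(benchCsvPath)
    output = open(benchCsvPath, 'a')
    try:
        if newFile:
            output.write(','.join(csv_header) + '\n')
        # extrasaction='ignore' lets the row dict carry extra keys without breaking the writer
        csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                                dialect='excel', extrasaction='ignore', delimiter=',')
        csvWrt.writerow(row)
    finally:
        output.close()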
def sub_c2_rel_long(self):
    # a kludge
    h2o.setup_benchmark_log()

    avgMichalSize = 116561140
    bucket = 'home-0xdiag-datasets'
    ### importFolderPath = 'more1_1200_link'
    importFolderPath = 'manyfiles-nflx-gz'
    print "Using .gz'ed files in", importFolderPath
    if len(h2o.nodes) == 1:
        csvFilenameList = [
            ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600),
        ]
    else:
        csvFilenameList = [
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
        ]

    if LOG_MACHINE_STATS:
        benchmarkLogging = ['cpu', 'disk', 'network']
    else:
        benchmarkLogging = []

    pollTimeoutSecs = 120
    retryDelaySecs = 10

    for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
        csvPathname = importFolderPath + "/" + csvFilepattern

        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')

        # this accumulates performance stats into a benchmark log over multiple runs
        # good for tracking whether we're getting slower or faster
        h2o.cloudPerfH2O.change_logfile(csvFilename)
        h2o.cloudPerfH2O.message("")
        h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
            hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs,
            benchmarkLogging=benchmarkLogging)
        elapsed = time.time() - start
        print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        print "Parse result['destination_key']:", parseResult['destination_key']
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        if totalBytes is not None:
            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        if DO_GLM:
            # these are all the columns that are enums in the dataset...too many for GLM!
            x = range(542)  # don't include the output column
            # remove the output too! (378)
            ignore_x = []
            # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
            for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]:
                x.remove(i)
                ignore_x.append(i)

            # increment by one, because we are no longer zero offset!
            x = ",".join(map(lambda x: "C" + str(x+1), x))
            ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

            GLMkwargs = {
                'family': 'binomial',
                'x': x,
                'y': 'C379',
                'case': 15,
                'case_mode': '>',
                'max_iter': 4,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
            }

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
            elapsed = time.time() - start
            h2o.check_sandbox_for_errors()

            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
            msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)

        h2o_cmd.checkKeyDistribution()
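# A minimal sketch (not part of the original test) of the column-naming step above: this API expects
# 1-based "C<n>" column names, so zero-based indices are shifted by one and prefixed with 'C'.
# The index lists below are illustrative.

def to_c_names(indices):
    # zero-based indices -> comma-separated 1-based "C" names, e.g. 0 -> "C1"
    return ",".join("C" + str(i + 1) for i in indices)

if __name__ == '__main__':
    dropped = [3, 4, 5, 378]
    kept = [i for i in range(542) if i not in dropped]
    print to_c_names(dropped)            # columns to ignore, e.g. "C4,C5,C6,C379"
    print to_c_names(kept)[:40] + "..."  # columns passed as x (truncated for display)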
def test_hdfs_cdh4_fvec(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if fail"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        # "3G_poker_shuffle"
        ("and-testing.data", 60),
        ### "arcene2_train.both",
        ### "arcene_train.both",
        ### "bestbuy_test.csv",
        ("covtype.data", 60),
        ("covtype4x.shuffle.data", 60),
        # "four_billion_rows.csv",
        ("hhp.unbalanced.012.data.gz", 60),
        ("hhp.unbalanced.data.gz", 60),
        ("leads.csv", 60),
        # ("covtype.169x.data", 600),
        ("prostate_long_1G.csv", 600),
        # ("airlines_all.csv", 900),
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    trial = 0
    print "try importing /tmp2"
    d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
    print h2o.dump_json(d)
    d = h2i.import_only(path="datasets/*", schema='hdfs', timeoutSecs=1000)
    print h2o.dump_json(d)
    for (csvFilename, timeoutSecs) in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a.hex"
        csvPathname = "datasets/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, header=0, timeoutSecs=1000)
        print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

        start = time.time()
        print "Saving", csvFilename, 'to HDFS'
        print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
        print "Unique per-user to avoid permission issues"
        username = getpass.getuser()
        # reuse the file name to avoid running out of space
        csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)
        path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
        h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
        print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
        trial += 1

        print "Re-Loading", csvFilename, 'from HDFS'
        start = time.time()
        hex_key = "a2.hex"
        time.sleep(2)
        d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
        print h2o.dump_json(d)
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, header=0, timeoutSecs=1000)
        print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
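# A minimal sketch (not part of the original test) of the HDFS round trip above: parse a dataset from
# HDFS, export it back under a per-user path, then re-parse the exported copy. It assumes the same
# h2o/h2i helpers, a running cloud with hdfs_name_node configured, and illustrative key/path names.
import getpass
import time
import h2o, h2i

def hdfs_round_trip(csvPathname, timeoutSecs=1000):
    # first parse, straight from HDFS
    parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key="a.hex",
        header=0, timeoutSecs=timeoutSecs)
    # export to a per-user location to avoid permission collisions
    username = getpass.getuser()
    exportPathname = "tmp2/a_h2o_export_files.%s.csv" % username
    path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + exportPathname
    h2o.nodes[0].export_files(src_key="a.hex", path=path, force=1, timeoutSecs=timeoutSecs)
    # give HDFS a moment, then re-parse what we just wrote
    time.sleep(2)
    return h2i.import_parse(path=exportPathname, schema='hdfs', hex_key="a2.hex",
        header=0, timeoutSecs=timeoutSecs)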
def test_parse_multi_header_rand_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    allowedLetters = 'abcdeABCDE01234[]'
    headerChoices = []
    for n in range(500):  # max # of cols below is 500
        done = False
        while not done:
            l = random.randint(1, 64)  # random length headers
            headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
            # we keep trying if we already have that header name. Has to be unique.
            done = headerName not in headerChoices
        headerChoices.append(headerName)

    tryList = [
        (3, 5, 9, 'cA', 60, 0),
        # (3, 5, 25, 'cA', 60, 0),
        # (10, 100, 500, 'cA', 60, 0),
    ]

    for trial in range(20):
        (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
        print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        totalHeaderRows = 0

        # random selection of parse param choices
        # HEADER_HAS_HDR_ROW = random.randint(0,1)
        HEADER_HAS_HDR_ROW = 1

        DATA_HAS_HDR_ROW = random.randint(0,1)
        PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)

        # DATA_FIRST_IS_COMMENT = random.randint(0,1)
        # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
        # FIX! doesn't seem to like just comment in the header file
        DATA_FIRST_IS_COMMENT = 0
        HEADER_FIRST_IS_COMMENT = 0

        GZIP_DATA = random.randint(0,1)
        GZIP_HEADER = random.randint(0,1)
        SEP_CHAR_GEN = random.choice(paramsDict['separator'])

        HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
        if HEADER_SEP_CHAR_GEN == 'same':
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # don't put a header in a data file with a different separator?
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # Hack: if both data and header files have a header, then, just in case,
        # the header and data files should have the same separator;
        # if they don't, make the header match the data
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # New for fvec? if separators are not the same, then the header separator needs to be comma
        if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
            HEADER_SEP_CHAR_GEN = ','

        # screw it. make them always match
        HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        if HEADER_SEP_CHAR_GEN in (',', ' '):
            pass
            # extra spaces? Don't add any
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

        kwargs = {}
        for k, v in paramsDict.items():
            kwargs[k] = random.choice(v)

        kwargs['separator'] = SEP_CHAR_GEN
        # parse doesn't auto-detect tab; it will autodetect space and comma
        if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
            del kwargs['separator']
        else:
            kwargs['separator'] = ord(SEP_CHAR_GEN)

        # randomly add leading and trailing white space
        # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
        if SEP_CHAR_GEN in (',', ' '):
            if random.randint(0,1):
                SEP_CHAR_GEN = " " + SEP_CHAR_GEN
            if random.randint(0,1):
                SEP_CHAR_GEN = SEP_CHAR_GEN + " "

        print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
        print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
        print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
        print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
        print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
        print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
        print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
        print 'GZIP_DATA:', GZIP_DATA
        print 'GZIP_HEADER:', GZIP_HEADER

        # they need to both use the same separator (h2o rule)
        # can't have duplicates
        hfhList = random.sample(headerChoices, colCount) + ["output"]
        # UPDATE: always use comma or space for header separator?? it should work no matter what
        # separator the data uses?
        headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
        print "headerForHeader:", headerForHeader

        # make these different
        # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
        # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
        hfdList = hfhList
        headerForData = SEP_CHAR_GEN.join(hfdList)

        # create data files
        for fileN in range(fileNum):
            csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            csvFilename = 'syn_data_' + csvFilenameSuffix
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
            (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount,
                headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_DATA:
                csvPathnamegz = csvPathname + ".gz"
                print "gzipping to", csvPathnamegz
                h2o_util.file_gzip(csvPathname, csvPathnamegz)
                os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                # pattern match should find the right key with csvPathname

        # create the header file
        hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
        hdrFilename = 'syn_header_' + hdrFilenameSuffix
        hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
        # dataRowsWithHeader = 0 # temp hack
        (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader,
            headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
            commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
        # only include header file data rows if the parse pattern includes it
        if PARSE_PATTERN_INCLUDES_HEADER:
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
        if GZIP_HEADER:
            hdrPathnamegz = hdrPathname + ".gz"
            print "gzipping to", hdrPathnamegz
            h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
            os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
            # pattern match should find the right key with hdrPathname

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = "syn_dst" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)

        h2o_cmd.runStoreView()
        headerKey = h2i.find_key(hdrFilename)
        dataKey = h2i.find_key(csvFilename)

        # use regex. the only files in the dir will be the ones we just created
        # with *fileN* match
        print "Header Key =", headerKey

        # put the right name in
        if kwargs['header_from_file'] == 'header':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = headerKey
        # use one of the data files?
        elif kwargs['header_from_file'] == 'data':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = dataKey

        # if there's no header in the header file, turn off the header_from_file
        if not HEADER_HAS_HDR_ROW:
            kwargs['header_from_file'] = None

        if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
            ignoreForRf = hfhList[0]
        elif DATA_HAS_HDR_ROW:
            ignoreForRf = hfdList[0]
        else:
            ignoreForRf = None

        print "If header_from_file= is used, h2o requires header=1 to be forced"
        if kwargs['header_from_file']:
            kwargs['header'] = 1
        # if we have a header in a data file, tell h2o (for now)
        elif DATA_HAS_HDR_ROW:
            kwargs['header'] = 1
        else:
            kwargs['header'] = 0

        # may have error if h2o doesn't get anything!
        start = time.time()
        if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
            pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
        else:
            pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

        # don't pass to parse
        kwargs.pop('hdr_separator', None)
        parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
        print "parseResult['destination_key']: " + parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # more reporting: (we can error here if extra col in header,
        # causes all NA for missing col of data)
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], totalCols, \
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

        # do we end up parsing one data row as a header because of mismatch in gen/param
        h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW
        # header in data file gets treated as data
        h2oGainsOneData = (headerRowsDone != 0) and (kwargs['header'] == 1) and \
            DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
        h2oGainsOneData = False
        print "h2oLosesOneData:", h2oLosesOneData
        print "h2oGainsOneData:", h2oGainsOneData

        if h2oLosesOneData:
            totalDataRows -= 1
        if h2oGainsOneData:
            totalDataRows += 1

        if 1 == 0:  # FIX! don't check for now
            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                (inspect['numRows'], totalDataRows))

        # put in an ignore param, that will fail unless headers were parsed correctly
        # doesn't matter if the header got a comment, should see it
        kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
        start = time.time()
        # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        h2o.check_sandbox_for_errors()
        h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
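# A minimal sketch (not part of the original test) of the header_from_file decision logic above:
# pick which key supplies the column names and whether header=1 must be forced. Inputs are plain
# booleans/strings so this runs standalone; the key names in the usage example are illustrative.

def header_kwargs(header_from_file_choice, headerKey, dataKey,
                  header_has_hdr_row, data_has_hdr_row):
    kwargs = {}
    # map the symbolic choice onto an actual key name
    if header_from_file_choice == 'header':
        kwargs['header_from_file'] = headerKey
    elif header_from_file_choice == 'data':
        kwargs['header_from_file'] = dataKey
    else:
        kwargs['header_from_file'] = None
    # no header row in the header file -> can't use it as a header source
    if not header_has_hdr_row:
        kwargs['header_from_file'] = None
    # header=1 has to be forced whenever header_from_file is used, or when the data carries its own header row
    kwargs['header'] = 1 if (kwargs['header_from_file'] or data_has_hdr_row) else 0
    return kwargs

if __name__ == '__main__':
    print header_kwargs('header', 'syn_header_1.hex', 'syn_data_0_1.hex', True, False)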