def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if this fails"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameList = [
        "airlines_88_08_100lines.csv",
    ]

    h2b.browseTheCloud()
    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}

    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
def test_GLM_hdfs_YearPredictionMSD(self):
    if localhost:
        csvFilenameList = [
            'YearPredictionMSD.txt',
            'YearPredictionMSD.txt'
        ]
    else:
        csvFilenameList = [
            'YearPredictionMSD.txt',
            'YearPredictionMSD.txt'
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    validations1 = {}
    coefficients1 = {}
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        h2i.setupImportHdfs()
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=60)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvFilename

        start = time.time()
        # can't pass lambda as a kwarg because it's a python reserved word
        # FIX! just look at X=0:1 for speed, for now
        kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=500, **kwargs)

        # different when n_folds cross-validation is used? No trainingErrorDetails?
        h2o.verboseprint("\nglm:", glm)
        ### h2b.browseJsonHistoryAsUrlLastMatch("GLM")

        GLMModel = glm['GLMModel']
        print "GLM time", GLMModel['time']

        coefficients = GLMModel['coefficients']
        validationsList = GLMModel['validations']
        validations = validationsList.pop()
        # validations['err']

        if validations1:
            h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
        else:
            validations1 = copy.deepcopy(validations)

        if coefficients1:
            h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
        else:
            coefficients1 = copy.deepcopy(coefficients)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_import_nflx_parse_loop(self):
    print "Using the -.gz files from hdfs"
    # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
    csvFilename = "file_10.dat.gz"
    csvFilepattern = "file_1[0-9].dat.gz"
    trialMax = 2

    for tryHeap in [24]:
        print "\n", tryHeap, "GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(node_count=1, java_heap_GB=tryHeap,
                use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap,
                use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3')

        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandbox_ignore_errors = True

        timeoutSecs = 500
        importFolderPath = "/datasets/manyfiles-nflx-gz"
        for trial in range(trialMax):
            # since we delete the key, we have to re-import every iteration, to get it again
            importHdfsResult = h2i.setupImportHdfs(path=importFolderPath)
            hdfsFullList = importHdfsResult['succeeded']
            for k in hdfsFullList:
                key = k['key']
                # just print the first file
                if 'nflx' in key and 'file_1.dat.gz' in key:
                    # should be hdfs://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                    print "example file we'll use:", key

            ### print "hdfsFullList:", h2o.dump_json(hdfsFullList)
            # error if none?
            self.assertGreater(len(hdfsFullList), 8, "Didn't see more than 8 files in hdfs?")

            key2 = csvFilename + "_" + str(trial) + ".hex"
            csvFilePattern = 'file_1.dat.gz'
            # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz",

            time.sleep(5)
            print "Loading from hdfs:", importFolderPath + "/" + csvFilePattern
            start = time.time()
            parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilePattern, path=importFolderPath,
                key2=key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start

            print csvFilePattern, 'parse time:', parseKey['response']['time']
            print "parse result:", parseKey['destination_key']
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_cmd.runStoreView()

        h2o.tear_down_cloud()
        # sticky ports? wait a bit.
        time.sleep(5)
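# For reference, the import result consumed above exposes its keys as a list of
# {'key': ...} dicts under 'succeeded'. The helper below is a hypothetical sketch
# (not part of h2i) that condenses the scan-and-assert pattern used in the trial loop,
# assuming that same result shape.
def summarize_hdfs_import(importHdfsResult, mustContain='nflx'):
    succeeded = importHdfsResult.get('succeeded', [])
    matching = [k['key'] for k in succeeded if mustContain in k['key']]
    print "imported keys:", len(succeeded), "containing '%s':" % mustContain, len(matching)
    if matching:
        # e.g. hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_1.dat.gz
        print "example key:", matching[0]
    return matching

# usage sketch inside the trial loop:
# matching = summarize_hdfs_import(importHdfsResult)
# self.assertGreater(len(matching), 8, "Didn't see more than 8 files in hdfs?")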
def test_hdfs_multi_copies(self):
    print "\nUse the new regex capabilities for selecting hdfs: try *copies* at /datasets"
    print "This should match to a folder with about twenty covtype10x?"

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # defaults to /datasets
    h2i.setupImportHdfs()
    parseKey = h2i.parseImportHdfsFile(csvFilename='*covtype10x_copies*', key2='copies.hex',
        exclude=None, header=None, timeoutSecs=600)
    print "*copies* regex to hdfs /datasets", 'parse time:', parseKey['response']['time']
    print "parse result:", parseKey['destination_key']
    sys.stdout.flush()
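# The '*covtype10x_copies*' argument above looks like a shell-style wildcard rather than a
# full regular expression. As a client-side illustration only (H2O does the actual matching
# against the real HDFS listing, and these sample keys are made up), Python's fnmatch shows
# how such a pattern selects keys.
import fnmatch

sampleKeys = [
    "hdfs://192.168.1.176/datasets/covtype10x_copies/part-00000",
    "hdfs://192.168.1.176/datasets/covtype10x_copies/part-00001",
    "hdfs://192.168.1.176/datasets/covtype.data",
]
matched = [k for k in sampleKeys if fnmatch.fnmatch(k, "*covtype10x_copies*")]
print "matched %d of %d sample keys" % (len(matched), len(sampleKeys))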
def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if this fails"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "allyears2k.csv",
        "billion_rows.csv.gz",
        "covtype.data",
        "covtype.shuffled.data",
        "covtype200x.data",
        "covtype20x.data",
        "kddcup_1999.data.gz",
        "rand_logreg_100000000x70.csv.gz",
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}

    h2i.setupImportHdfs(path='/datasets/standard', schema='maprfs')
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets/standard',
            schema='maprfs', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if this fails"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "TEST-poker1000.csv",
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    # h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}

    h2i.setupImportHdfs(path='/datasets', schema='maprfs')
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets',
            schema='maprfs', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # wait in case it recomputes it
        time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_hdfs_multi_bad_csv(self):
    print "\nUse the new regex capabilities for selecting hdfs: try *csv* at /datasets"

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    # defaults to /datasets
    h2i.setupImportHdfs()
    # path should default to /datasets
    # One .gz in with non-.gz seems to cause a stack trace, so don't match to all (*airlines*).
    # No... maybe it's just the zero-length gz file? No, it doesn't show up in the list of keys.
    # drwxr-xr-x - earl supergroup 0 2013-07-24 17:55 /datasets/airline.gz
    # -rw-r--r-- 3 hduser supergroup 12155501626 2013-02-22 17:13 /datasets/airline_116M.csv
    # -rw-r--r-- 3 hduser supergroup 11349125129 2013-05-03 15:45 /datasets/airlines_1988_2008.csv
    # -rw-r--r-- 3 hduser supergroup 11349125429 2013-05-01 12:52 /datasets/airlines_1988_2008_shuffled.csv
    # -rw-r--r-- 3 hduser supergroup 9936 2013-05-01 11:49 /datasets/airlines_88_08_100lines.csv
    # -rw-r--r-- 3 hduser supergroup 12155501626 2013-02-23 15:59 /datasets/airlines_all.csv
    # -rw-r--r-- 3 hduser supergroup 133710514626 2013-02-23 15:21 /datasets/airlines_all_11x.csv
    parseKey = h2i.parseImportHdfsFile(csvFilename="airline_116M.csv", key2="random_csv.hex", timeoutSecs=600)
    print "*csv* regex to hdfs /datasets", "parse time:", parseKey["response"]["time"]
    print "parse result:", parseKey["destination_key"]
    sys.stdout.flush()
def test_B_hdfs_files(self):
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "3G_poker_shuffle",
        "TEST-poker1000.csv",
        # corrupt zip file?
        # "allstate_claim_prediction_train_set.zip",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        "bestbuy_test.csv",
        "bestbuy_train.csv",
        "billion_rows.csv.gz",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        "covtype.169x.data",
        "covtype.4x.shuffle.data",
        "covtype.data",
        "covtype4x.shuffle.data",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        "leads.csv",
        "prostate_long_1G.csv",
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    timeoutSecs = 1000
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}

    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        start = time.time()
        print 'Parsing', csvFilename
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets',
            timeoutSecs=timeoutSecs, retryDelaySecs=1.0)
        print csvFilename, '\nparse time (python)', time.time() - start, 'seconds'
        print csvFilename, '\nparse time (h2o):', parseKey['response']['time']
        ### print h2o.dump_json(parseKey['response'])
        print "parse result:", parseKey['destination_key']

        # I use this if I want the larger set in my localdir
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### print h2o.dump_json(inspect)
        cols = inspect['cols']

        # look for a nonzero num_missing_values count in each col
        for i, colDict in enumerate(cols):
            num_missing_values = colDict['num_missing_values']
            if num_missing_values != 0:
                ### print "%s: col: %d, num_missing_values: %d" % (csvFilename, i, num_missing_values)
                pass

        ### print h2o.dump_json(cols[0])
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        row_size = inspect['row_size']
        ptype = inspect['type']
        value_size_bytes = inspect['value_size_bytes']
        response = inspect['response']
        ptime = response['time']

        print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, value_size_bytes: %s, response: %s, time: %s" % \
            (num_cols, num_rows, row_size, ptype, value_size_bytes, response, ptime)

        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        print "\n" + csvFilename

        # start = time.time()
        # RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
        # h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # # wait in case it recomputes it
        # time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()
def test_KMeans_sphere15_180GB(self):
    csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
    totalBytes = 183538602156
    if FROM_HDFS:
        importFolderPath = "/datasets/kmeans_big"
        csvPathname = "hdfs://" + importFolderPath + '/' + csvFilename
    else:
        importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
        csvPathname = importFolderPath + '/' + csvFilename

    # FIX! put right values in
    # will there be different expected for random vs the other inits?
    expected = [
        ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988),
        ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98),
        ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253),
        ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474),
        ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094),
        ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475),
        ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035),
        ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276),
        ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314),
        ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955),
        ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215),
        ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249),
        ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379),
        ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982),
        ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646),
    ]

    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
    benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
    # IOStatus can hang?
    benchmarkLogging = ['cpu', 'disk', 'network']
    benchmarkLogging = []

    for trial in range(6):
        # IMPORT**********************************************
        # since H2O deletes the source key, re-import every iteration.
        if FROM_HDFS:
            importFolderResult = h2i.setupImportHdfs(None, importFolderPath)
        else:
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)

        # PARSE ****************************************
        print "Parse starting: " + csvFilename
        key2 = csvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        timeoutSecs = 2 * 3600
        kwargs = {}
        if FROM_HDFS:
            parseKey = h2i.parseImportHdfsFile(None, csvFilename, importFolderPath, key2=key2,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)
        else:
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        fileMBS = (totalBytes/1e6)/elapsed
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # KMeans ****************************************
        print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
        kwargs = {
            'k': 15,
            'initialization': 'Furthest',
            'epsilon': 1e-6,
            'cols': None,
            'destination_key': 'junk.hex',
            # reuse the same seed, to get deterministic results
            'seed': 265211114317615310,
        }

        if (trial % 3) == 0:
            kwargs['initialization'] = 'PlusPlus'
        elif (trial % 3) == 1:
            kwargs['initialization'] = 'Furthest'
        else:
            kwargs['initialization'] = None

        timeoutSecs = 4 * 3600
        params = kwargs
        paramsString = json.dumps(params)

        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
            benchmarkLogging=benchmarkLogging, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial " + str(trial),
            csvFilename, elapsed, paramsString)
        print l
        h2o.cloudPerfH2O.message(l)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
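# Each entry in the expected list above pairs a cluster center with its expected row count
# and within-cluster error, and allowedDelta is described in the comment as multipliers of
# the expected values, i.e. relative tolerances. The sketch below illustrates that kind of
# check under those assumptions; it is not the actual h2o_kmeans.compareResultsToExpected
# implementation, and the helper names are made up.
def within_rel_tol(actual, expectedValue, relTol):
    # relTol is a multiplier of the expected value, e.g. 0.01 == within 1 percent.
    # When the expected value is exactly 0, only an exact match passes here.
    return abs(actual - expectedValue) <= abs(expectedValue) * relTol

def check_one_cluster(resultTuple, expectedTuple, allowedDelta):
    center, rows, error = resultTuple
    expCenter, expRows, expError = expectedTuple
    centerTol, rowsTol, errorTol = allowedDelta
    centerOk = all(within_rel_tol(a, e, centerTol) for a, e in zip(center, expCenter))
    return centerOk and within_rel_tol(rows, expRows, rowsTol) and within_rel_tol(error, expError, errorTol)

# usage sketch:
# ok = check_one_cluster(tupleResultList[0], expected[0], (0.01, 0.01, 0.01))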
def test_B_load_hdfs_and_store_hex_to_hdfs(self):
    print "\nLoad a list of files from 0xdata hdfs, parse, and store the .hex to hdfs"
    print "\nYou can try running as hduser/hduser if this fails"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "covtype.data",
        "TEST-poker1000.csv",
        "leads.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        "bestbuy_test.csv",
        "bestbuy_train.csv",
        "covtype.4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        "covtype.169x.data",
        "prostate_2g.csv",
        "prostate_long.csv.gz",
        "prostate_long_1G.csv",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        "poker_c1s1_testing_refresh.csv",
        "3G_poker_shuffle",
        "billion_rows.csv.gz",
        "poker-hand.1244M.shuffled311M.full.txt",
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}

    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        print "Storing", parseKey['destination_key'], 'to HDFS'
        ### print "FIX! temporarily disabling since it causes HDFS corruption"
        storeKey = h2o_cmd.runStore2HDFS(key=parseKey['destination_key'], timeoutSecs=1000)
        h2b.browseJsonHistoryAsUrlLastMatch("Parse")

        sys.stdout.write('.')
        sys.stdout.flush()
def test_B_hdfs_files(self):
    print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
    print "\nYou can try running as hduser/hduser if this fails"
    # larger set in my local dir
    # fails because classes aren't integers
    # "allstate_claim_prediction_train_set.zip",
    csvFilenameAll = [
        "TEST-poker1000.csv",
        "leads.csv",
        "and-testing.data",
        "arcene2_train.both",
        "arcene_train.both",
        # these can't RF ..output classes not integer?
        # "bestbuy_test.csv",
        # "bestbuy_train.csv",
        "covtype.data",
        "covtype.4x.shuffle.data",
        "covtype4x.shuffle.data",
        "covtype.13x.data",
        "covtype.13x.shuffle.data",
        # "covtype.169x.data",
        # "prostate_2g.csv",
        # "prostate_long.csv.gz",
        "prostate_long_1G.csv",
        "hhp.unbalanced.012.1x11.data.gz",
        "hhp.unbalanced.012.data.gz",
        "hhp.unbalanced.data.gz",
        "hhp2.os.noisy.0_1.data",
        "hhp2.os.noisy.9_4.data",
        "hhp_9_14_12.data",
        # "poker_c1s1_testing_refresh.csv",
        # "3G_poker_shuffle",
        # "billion_rows.csv.gz",
        # "poker-hand.1244M.shuffled311M.full.txt",
    ]

    # pick 8 randomly!
    if (1 == 0):
        csvFilenameList = random.sample(csvFilenameAll, 8)
    # Alternatively: do the list in order! Note the order is easy to hard
    else:
        csvFilenameList = csvFilenameAll

    # pop open a browser on the cloud
    h2b.browseTheCloud()

    timeoutSecs = 200
    # save the first, for all comparisons, to avoid slow drift with each iteration
    firstglm = {}

    h2i.setupImportHdfs()
    for csvFilename in csvFilenameList:
        # creates csvFilename.hex from file in hdfs dir
        print "Loading", csvFilename, 'from HDFS'
        parseKey = h2i.parseImportHdfsFile(csvFilename=csvFilename, path='/datasets', timeoutSecs=1000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "parse result:", parseKey['destination_key']

        print "\n" + csvFilename
        start = time.time()
        RFview = h2o_cmd.runRFOnly(trees=1, parseKey=parseKey, timeoutSecs=2000)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")
        # wait in case it recomputes it
        time.sleep(10)

        sys.stdout.write('.')
        sys.stdout.flush()