def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    h2o.beta_features = True
    if localhost:
        h2o.build_cloud(3, java_heap_GB=1, use_hdfs=True)
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(1, java_heap_GB=4)
    else:
        h2o_hosts.build_cloud_with_hosts()
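# A minimal sketch (an assumption, not taken from any one of these files) of the
# unittest scaffolding these setUpClass snippets plug into: the cloud is built
# once per test class and torn down afterwards so its ports are freed before the
# next class runs. The class name 'Basic' is illustrative; h2o.unit_main() is
# the usual entry point in this test framework.
import unittest
import h2o, h2o_hosts

class Basic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        global localhost
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(1, java_heap_GB=4)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

if __name__ == '__main__':
    h2o.unit_main()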
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        # maybe fails more reliably with just 2 jvms?
        h2o.build_cloud(2, java_heap_GB=5)
    else:
        h2o_hosts.build_cloud_with_hosts()
def test_Cloud(self):
    base_port = 54300
    ports_per_node = 2
    for trials in range(0, 5):
        for tryNodes in range(3, 6):
            sys.stdout.write('.')
            sys.stdout.flush()
            start = time.time()
            # Sandbox cleaning is done in build_cloud now, so nosetests
            # (which doesn't run unit_main) works too.
            ### h2o.write_flatfile(node_count=tryNodes, base_port=base_port)
            h2o.build_cloud(node_count=tryNodes, java_heap_GB=1,
                timeoutSecs=30, retryDelaySecs=2, base_port=base_port, use_flatfile=True)
            print "loop %d: Build cloud of %d in %d s" % (trials, tryNodes, (time.time() - start))
            for i in range(2):
                print "nodes report size: %s consensus: %s expected: %d." % h2o.verify_cloud_size()
            h2o.tear_down_cloud()
            # With so many jvms, wait for sticky ports to be freed up (slow OS cleanup?).
            # Changed to also increment base_port, to avoid immediate reuse.
            time.sleep(1)
            base_port += ports_per_node * tryNodes
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(3, use_hdfs=True, hdfs_version='cdh3',
            hdfs_name_node='192.168.1.176')
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(node_count=1)
    else:
        h2o_hosts.build_cloud_with_hosts(node_count=1)
def setUpClass(cls):
    # Uses your username-specific json: pytest_config-<username>.json.
    # Do what my json says, but with my hdfs (hdfs_name_node comes from the json).
    # use_hdfs would be set to False here if hdfs were unreachable, because
    # H2O won't start if it can't talk to the hdfs.
    # h2o_hosts.build_cloud_with_hosts(use_hdfs=False)
    h2o.build_cloud(1, java_heap_GB=14, use_hdfs=True, java_extra_args='-verbose:class')
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(2, java_heap_GB=4, java_extra_args='-XX:+PrintCompilation')
    else:
        h2o_hosts.build_cloud_with_hosts(java_extra_args='-XX:+PrintCompilation')
def testCloud(self):
    base_port = 54300
    ports_per_node = 2
    print "\nTest was written because we sometimes see a bigger cloud than we want"
    print "You'll see the problem in the cloud in the browser"
    print "\nWorks if a real ip address is used. Fails with 127.0.0.1 (intermittent)"
    print "Builds cloud with 3, the extra being a non-127.0.0.1 node (the real ip)"
    print "Eventually it goes away, around 1 minute?"
    for trial in range(20):
        for tryNodes in range(2, 3):
            sys.stdout.write('.')
            sys.stdout.flush()
            start = time.time()
            ### this works
            ### h2o.build_cloud(use_this_ip_addr="192.168.0.37",
            # this intermittently fails
            h2o.build_cloud(use_this_ip_addr="127.0.0.1",
                node_count=tryNodes, base_port=base_port, java_heap_GB=1,
                timeoutSecs=15, retryDelaySecs=2)
            print "trial #%d: Build cloud of %d in %d secs" % (trial, tryNodes, (time.time() - start))
            h2o.verify_cloud_size()
            h2o.tear_down_cloud()
            # increment base_port to avoid sticky ports when we build another cloud
            # we only use two ports per node now?
            base_port += ports_per_node * tryNodes
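# Illustrative sketch (a hypothetical helper, not part of the tests above) of
# the port-window arithmetic the cloud-rebuild tests rely on: each JVM claims
# ports_per_node consecutive ports starting at base_port, so advancing by
# node_count * ports_per_node means the next cloud never reuses a socket that
# may still be stuck in TIME_WAIT.
def next_base_port(base_port, node_count, ports_per_node=2):
    # e.g. next_base_port(54300, 3) -> 54306
    return base_port + node_count * ports_per_node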
def testAll(self):
    try:
        h2o.build_cloud(node_count=2)
        # We don't have the port or ip configuration here that util/h2o.py does.
        # Keep this in sync with spawn_h2o there. Also no --nosigar here?
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
            'java',
            '-Dh2o.arg.ice_root=' + h2o.tmp_dir('ice.'),
            '-Dh2o.arg.name=pytest-' + getpass.getuser(),
            '-Dh2o.arg.ip=' + h2o.get_ip_address(),
            '-ea', '-jar', h2o.find_file('target/h2o.jar'),
            '-mainClass', 'org.junit.runner.JUnitCore',
            # The tests
            'water.parser.ParserTest',
        ])
        rc = ps.wait(None)
        out = file(stdout).read()
        err = file(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        h2o.tear_down_cloud()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(1, java_heap_GB=14, enable_benchmark_log=True)
    else:
        h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=True)
def setUpClass(cls):
    global SEED, localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(java_heap_GB=10)
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    global local_host
    local_host = 'hosts' not in os.getcwd()
    if local_host:
        h2o.build_cloud(2, java_heap_GB=4, java_extra_args='-XX:+PrintCompilation')
    else:
        h2o_hosts.build_cloud_with_hosts(java_extra_args='-XX:+PrintCompilation')
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(2, java_heap_GB=10, use_flatfile=True)
    else:
        import h2o_hosts
        h2o_hosts.build_cloud_with_hosts()
def test_B_slow_junit(self):
    h2o.tear_down_cloud()
    h2o.build_cloud(node_count=2)
    # We don't have the port or ip configuration here that util/h2o.py does.
    # Keep this in sync with spawn_h2o there. Also no --nosigar here?
    (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
        'java',
        '-Dh2o.arg.ice_root=' + h2o.tmp_dir('ice.'),
        '-Dh2o.arg.name=' + h2o.cloud_name(),
        '-Dh2o.arg.ip=' + h2o.get_ip_address(),
        '-ea', '-jar', h2o.find_file('target/h2o.jar'),
        '-mainClass', 'org.junit.runner.JUnitCore',
        # The tests
        'water.ConcurrentKeyTest',
        'hex.MinorityClassTest'
    ])
    # We get UDP receiver stack traces if we shut down too quickly after junit,
    # so wait a little bit before shutdown.
    time.sleep(3)
    rc = ps.wait(None)
    out = file(stdout).read()
    err = file(stderr).read()
    if rc is None:
        ps.terminate()
        raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    elif rc != 0:
        raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud()
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(3)
    else:
        h2o_hosts.build_cloud_with_hosts()
    h2b.browseTheCloud()
def test_import_covtype_parse_3jvm_fvec(self):
    h2o.beta_features = True
    csvFilename = "covtype.data"
    importFolderPath = "standard"
    trialMax = 2
    for tryHeap in [1]:
        print "\n", tryHeap, "GB heap, 3 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(node_count=3, java_heap_GB=tryHeap)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=3, java_heap_GB=tryHeap)
        for trial in range(trialMax):
            # import each time, because h2o deletes the source file after parse
            csvPathname = importFolderPath + "/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                hex_key=hex_key, timeoutSecs=20)
        # sticky ports?
        h2o.tear_down_cloud()
        time.sleep(5)
        # print "Waiting 60 secs for TIME_WAIT sockets to go away"
        # time.sleep(60)
        time.sleep(2)
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        # h2o.build_cloud(3, java_heap_GB=4, base_port=54323)
        h2o.build_cloud(3, java_heap_GB=12, base_port=54323)
    else:
        h2o_hosts.build_cloud_with_hosts(base_port=54323)
def setUpClass(cls):
    global SEED, localhost
    SEED = h2o.setup_random_seed()
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(2)
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    h2o.beta_features = True
    if localhost:
        h2o.build_cloud(3, java_heap_GB=1, use_hdfs=True, base_port=54321)
    else:
        h2o_hosts.build_cloud_with_hosts(base_port=54321)
def setUpClass(cls):
    global local_host
    local_host = "hosts" not in os.getcwd()
    if local_host:
        h2o.build_cloud(1, java_heap_GB=1)
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    # assume we're at 0xdata with its hdfs namenode
    global localhost
    localhost = h2o.decide_if_localhost()
    # hdfs_config='/opt/mapr/conf/mapr-clusters.conf',
    # hdfs_name_node='mr-0x1.0xdata.loc:7222')
    # hdfs_version='mapr2.1.3',
    if localhost:
        h2o.build_cloud(1, java_heap_GB=15, enable_benchmark_log=True,
            use_maprfs=True, hdfs_version="mapr3.0.1",
            hdfs_name_node="192.168.1.171:7222")
    else:
        h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=15, enable_benchmark_log=True,
            use_maprfs=True, hdfs_version="mapr3.0.1",
            hdfs_name_node="192.168.1.171:7222")
def setUpClass(cls):
    print "Will build_cloud() with random heap size and do overlapped import folder/parse (groups)"
    global SEED, localhost
    SEED = h2o.setup_random_seed()
    if RANDOM_HEAP:
        tryHeap = random.randint(4, 28)
    else:
        tryHeap = 28
    # print "\n", tryHeap, "GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
    print "\n", tryHeap, "GB heap, 1 jvm per host, import, then parse"
    localhost = h2o.decide_if_localhost()
    h2o.beta_features = True  # for the beta tab in the browser
    if localhost:
        h2o.build_cloud(node_count=3, java_heap_GB=4, base_port=54323,
            # use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3'
        )
    else:
        h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap, base_port=54321,
            # use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3'
        )
def test_A_all_junit(self):
    try:
        h2o.build_cloud(node_count=2, java_heap_GB=3)
        # We don't have the port or ip configuration here that util/h2o.py does.
        # Keep this in sync with spawn_h2o there. Also no --nosigar here?
        (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
            'java',
            '-Xms3G', '-Xmx3G',
            '-Dh2o.arg.ice_root=' + h2o.tmp_dir('ice.'),
            '-Dh2o.arg.name=' + h2o.cloud_name(),
            '-Dh2o.arg.ip=' + h2o.get_ip_address(),
            '-Dh2o.arg.port=54666',
            '-ea', '-jar', h2o.find_file('target/h2o.jar'),
            '-mainClass', 'org.junit.runner.JUnitCore',
            # The all test suite
            'water.suites.AllTestsSuite'
        ])
        rc = ps.wait(None)
        out = file(stdout).read()
        err = file(stderr).read()
        if rc is None:
            ps.terminate()
            raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
        elif rc != 0:
            raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    finally:
        h2o.tear_down_cloud()
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(3)
    else:
        h2o_hosts.build_cloud_with_hosts()
    h2o.beta_features = True
def setUpClass(cls):
    global local_host
    local_host = 'hosts' not in os.getcwd()
    if local_host:
        h2o.build_cloud(1, java_heap_GB=4)
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(node_count=1, java_heap_GB=10, base_port=54333)
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    global localhost
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(node_count=1, java_heap_GB=10)
    else:
        h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=10)
def test_import_covtype_parse_loop(self):
    csvFilename = "covtype.data"
    importFolderPath = "/home/0xdiag/datasets/standard"
    trialMax = 2
    localhost = h2o.decide_if_localhost()
    for tryHeap in [4, 3, 2, 1]:
        print "\n", tryHeap, "GB heap, 1 jvm, import folder, then loop parsing 'covtype.data' to unique keys"
        if localhost:
            h2o.build_cloud(node_count=1, java_heap_GB=tryHeap)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=tryHeap)
        for trial in range(trialMax):
            # import each time, because h2o deletes the source file after parse
            h2i.setupImportFolder(None, importFolderPath)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                key2=key2, timeoutSecs=20)
        # sticky ports?
        h2o.tear_down_cloud()
        time.sleep(2)
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(node_count=2, java_heap_GB=7)
    else:
        h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=13)
def setUpClass(cls):
    h2o.build_cloud(1)
    global SYNDATASETS_DIR
    SYNDATASETS_DIR = h2o.make_syn_dir()
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(1, java_heap_GB=4)
    else:
        h2o_hosts.build_cloud_with_hosts()
def test_benchmark_import(self):
    # typical size of the michal files
    avgMichalSizeUncompressed = 237270000
    avgMichalSize = 116561140
    avgSynSize = 4020000
    covtype200xSize = 15033863400
    synSize = 183
    if 1 == 1:
        # importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
        # importFolderPathFull = '/home/0xdiag/datasets/manyfiles-nflx-gz'
        # importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        # this pattern from the browser correctly does 100 files, 1M rows:
        # source_key=*/home/0xdiag/datasets/manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz
        csvFilenameAll = [
            ("file_1.dat.gz", "file_1_A.dat.gz", 1 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            # ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            # ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),
            # ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            # ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
        ]
        # csvFilenameList = random.sample(csvFilenameAll, 1)
        csvFilenameList = csvFilenameAll
        # split out the pattern match and the filename used for the hex key
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu', 'disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']  # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 180
        retryDelaySecs = 10

        localhost = h2o.decide_if_localhost()
        if localhost:
            tryHeap = 4
            h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)
        else:
            tryHeap = 28
            h2o_hosts.build_cloud_with_hosts(1, java_heap_GB=tryHeap, base_port=base_port,
                enable_benchmark_log=True)

        for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()
            # to avoid sticky ports?
            ### base_port += 2
            h2o.beta_features = True
            for trial in range(trialMax):
                # (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
                if DO_IMPORT_CHECK:  # module-level flag, defined outside this snippet
                    for j in range(2):  # don't shadow the enumerate index i; it's used below
                        csvPathname = importFolderPath + "/" + csvFilepattern
                        (importResult, importPattern) = h2i.import_only(
                            bucket='home-0xdiag-datasets', path=csvPathname,
                            schema='local', timeoutSecs=timeoutSecs)
                        importFullList = importResult['files']
                        importFailList = importResult['fails']
                        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # creates csvFilename.hex from file in importFolder dir
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                csvPathname = importFolderPath + "/" + csvFilepattern
                start = time.time()
                parseResult = h2i.import_parse(
                    bucket='home-0xdiag-datasets', path=csvPathname,
                    schema='local', hex_key=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
                    "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    # importFolderPathFull comes from the commented-out assignment above;
                    # the noPoll path assumes it's set
                    if (i + 1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i + 1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(
                            path=csvPathname, hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
                            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                    if (i + 2) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i + 2]
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        parseResult = h2i.import_parse(
                            path=csvPathname, hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs, noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
                            "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
                        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    pass
                    # We should be able to see the parse result?
                    # h2o_cmd.check_enums_from_inspect(parseResult)

                # The nflx data doesn't have a small enough # of classes in any col.
                # Use exec to randomFilter out 200 rows for a quick RF. That should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter(' + origKey + ',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                # h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)
                    # don't include the output column; remove the output too! (378)
                    for c in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                              424, 425, 426, 540, 541, 378]:
                        x.remove(c)
                    x = ",".join(map(str, x))
                    GLMkwargs = {
                        'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
                #**********************************************************************************

                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
                h2o.nodes[0].remove_all_keys()
                ### time.sleep(3600)

            ### h2o.tear_down_cloud()
            if not localhost:
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                ### time.sleep(30)

            sys.stdout.write('.')
            sys.stdout.flush()
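# Worked example (illustrative) of the fileMBS calculation above: a 100-file
# pattern has totalBytes = 100 * avgMichalSize = 11,656,114,000 bytes; if that
# parse took 60 secs, fileMBS = (11656114000 / 1e6) / 60 ~= 194 MB/sec.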
def build_cloud_with_hosts(node_count=None, **kwargs):
    ## if not h2o.disable_time_stamp:
    ##     sys.stdout = h2o.OutWrapper(sys.stdout)

    # legacy: we allow node_count to be positional.
    # If it's used positionally, stick it in kwargs (overwriting if it's there too).
    if node_count is not None:
        # we use h2o_per_host in the config file. It translates to node_count for build_cloud.
        kwargs['h2o_per_host'] = node_count
        # set node_count to None to make sure we don't use it below; 'h2o_per_host' should be used
        node_count = None

    # randomize the default base_port
    offset = random.randint(0, 31)

    # For new params: just update this list with the param name and default, and you're done.
    allParamsDefault = {
        'use_flatfile': None,
        # default to True, so when we flip import folder to hdfs+s3n import on ec2,
        # the cloud is built correctly
        'use_hdfs': True,
        'hdfs_name_node': None,
        'hdfs_config': None,
        'hdfs_version': None,
        'java_heap_GB': None,
        'java_heap_MB': None,
        'java_extra_args': None,
        'timeoutSecs': 60,
        'retryDelaySecs': 2,
        'cleanup': True,
        'slow_connection': False,
        'h2o_per_host': 2,
        'ip': '["127.0.0.1"]',  # this is for creating the hosts list
        'base_port': 54300 + offset,
        'username': '******',
        'password': None,
        'rand_shuffle': True,
        'use_home_for_ice': False,
        'key_filename': None,
        'aws_credentials': None,
        'redirect_import_folder_to_s3_path': None,
        'redirect_import_folder_to_s3n_path': None,
        'disable_h2o_log': False,
        'enable_benchmark_log': False,
        'h2o_remote_buckets_root': None,
        'conservative': False,
        'create_json': False,
        # pass this from cloud building to the common "release" h2o_test.py classes,
        # for deciding whether keys should be deleted when a test ends
        'delete_keys_at_teardown': False,
        'clone_cloud': False,
        'cloud_name': None,
        'force_tcp': None,
        'random_udp_drop': None,
        'sandbox_ignore_errors': None,
    }

    # initialize the default values
    paramsToUse = {}
    for k, v in allParamsDefault.iteritems():
        paramsToUse[k] = allParamsDefault.setdefault(k, v)

    # allow the user to specify the config json at the command line. config_json is a global.
    if h2o.config_json:
        configFilename = find_config(h2o.config_json)
    else:
        # configs may be in the testdir_hosts
        configFilename = find_config(h2o.default_hosts_file())

    h2o.verboseprint("Loading host config from", configFilename)
    with open(configFilename, 'rb') as fp:
        hostDict = json.load(fp)

    for k, v in hostDict.iteritems():
        # Don't take in params that we don't have in the list above,
        # because michal has extra params (and comments!) in here for ec2.
        if k in paramsToUse:
            paramsToUse[k] = hostDict.setdefault(k, v)

    # Now overwrite with anything passed by the test.
    # Whatever the test passes always overrules the config json.
    for k, v in kwargs.iteritems():
        paramsToUse[k] = kwargs.setdefault(k, v)

    # Let's assume we should set h2o_remote_buckets_root (only affects schema=local)
    # to the home directory of whatever remote user is being used for the hosts.
    # Better than living with a decision we made from scanning locally
    # (remote might not match local). Assume the remote user has a /home/<username>
    # (linux targets?).
    # This only affects import folder path name generation by python tests.
    if paramsToUse['username']:
        paramsToUse['h2o_remote_buckets_root'] = "/home/" + paramsToUse['username']

    h2o.verboseprint("All build_cloud_with_hosts params:", paramsToUse)

    #********************
    global hosts
    hosts = []
    # Update: special-case paramsToUse['ip'] == ["127.0.0.1"] and use the normal build_cloud.
    # This allows all the tests in testdir_host to be run with a special config
    # that points to 127.0.0.1. hosts should be None for everyone if normal
    # build_cloud is desired.
    if paramsToUse['ip'] == ["127.0.0.1"]:
        hosts = None
    else:
        h2o.verboseprint("About to RemoteHost; likely a bad ip if this hangs")
        hosts = []
        for h in paramsToUse['ip']:
            h2o.verboseprint("Connecting to:", h)
            # expand any ~ or ~user in the string
            key_filename = paramsToUse['key_filename']
            if key_filename:  # don't try to expand if None
                key_filename = os.path.expanduser(key_filename)
            hosts.append(h2o.RemoteHost(addr=h,
                username=paramsToUse['username'],
                password=paramsToUse['password'],
                key_filename=key_filename))

    # Done with these; don't pass them to build_cloud.
    # 'ip' was the list of ip's from the config file, replaced by 'hosts' for build_cloud.
    paramsToUse.pop('ip')
    # we want to save username in the node info, so don't pop it
    # paramsToUse.pop('username')
    paramsToUse.pop('password')
    paramsToUse.pop('key_filename')

    # The flatfile is going into sandbox (LOG_DIR) now, so clean it first
    # (this makes the sandbox dir if it doesn't exist already).
    h2o.clean_sandbox()

    # handles hosts=None correctly
    h2o.write_flatfile(
        node_count=paramsToUse['h2o_per_host'],
        # let the env variable H2O_PORT_OFFSET add in there
        base_port=paramsToUse['base_port'],
        hosts=hosts,
        rand_shuffle=paramsToUse['rand_shuffle'],
        port_offset=h2o.get_port_offset(),
    )

    if hosts is not None:
        # this uploads the flatfile too
        h2o.upload_jar_to_remote_hosts(hosts, slow_connection=paramsToUse['slow_connection'])
        # The timeout wants to be larger for large numbers of hosts * h2o_per_host:
        # use a 60 sec minimum, plus 8 secs per node.
        timeoutSecs = max(60, 8 * (len(hosts) * paramsToUse['h2o_per_host']))
    else:  # for the 127.0.0.1 case
        timeoutSecs = 60
    paramsToUse.pop('slow_connection')

    # sandbox gets cleaned in build_cloud
    # legacy param issue
    node_count = paramsToUse['h2o_per_host']
    paramsToUse.pop('h2o_per_host')
    print "java_heap_GB", paramsToUse['java_heap_GB']
    # Don't wipe out or create the sandbox: we already did that here, and put the flatfile there.
    h2o.build_cloud(node_count, hosts=hosts, init_sandbox=False, **paramsToUse)
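# Hypothetical example (the field names are taken from the keys this function
# actually reads; the values are made up) of a pytest_config-<username>.json
# that build_cloud_with_hosts() would consume:
#
# {
#     "username": "0xdiag",
#     "key_filename": "~/.ssh/id_rsa",
#     "ip": ["192.168.1.150", "192.168.1.151"],
#     "h2o_per_host": 2,
#     "base_port": 54340,
#     "java_heap_GB": 4,
#     "use_hdfs": true
# }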
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(3, use_hdfs=True, hdfs_version='cdh4',
            hdfs_name_node='172.16.2.176')
    else:
        h2o_hosts.build_cloud_with_hosts()
def setUpClass(cls):
    localhost = h2o.decide_if_localhost()
    if localhost:
        h2o.build_cloud(node_count=4)
    else:
        h2o_hosts.build_cloud_with_hosts()