Example #1
def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     h2o.beta_features = True
     if (localhost):
         h2o.build_cloud(3, java_heap_GB=1, use_hdfs=True)
     else:
         h2o_hosts.build_cloud_with_hosts()
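These setUpClass fixtures are normally paired with a tearDownClass that releases the cloud. A minimal sketch, assuming the usual pattern and reusing the same h2o.tear_down_cloud() call that the longer examples below make explicit:

 def tearDownClass(cls):
     h2o.tear_down_cloud()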
Example #2
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(1,java_heap_GB=4)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #3
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         # maybe fails more reliably with just 2 jvms?
         h2o.build_cloud(2,java_heap_GB=5)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #4
    def test_Cloud(self):
        base_port = 54300
        ports_per_node = 2
        for trials in range(0,5):
            for tryNodes in range(3,6):
                sys.stdout.write('.')
                sys.stdout.flush()

                start = time.time()
                # start by cleaning sandbox (in build_cloud). 
                # so nosetest works which doesn't do unit_main

                # done in build_cloud now
                ### h2o.write_flatfile(node_count=tryNodes, base_port=base_port)
                h2o.build_cloud(node_count=tryNodes, java_heap_GB=1,
                    timeoutSecs=30, retryDelaySecs=2, base_port=base_port, use_flatfile=True)
                print "loop %d: Build cloud of %d in %d s" % (trials, tryNodes, (time.time() - start)) 

                for i in range(2):
                    print "nodes report size: %s consensus: %s expected: %d." % h2o.verify_cloud_size()

                h2o.tear_down_cloud()
                # with so many jvms, wait for sticky ports to be freed up..slow os stuff?
                # changed, to increment the base_port, to avoid reuse immediately
                time.sleep(1)
                base_port += ports_per_node * tryNodes
Example #5
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(3, 
             use_hdfs=True, hdfs_version='cdh3', hdfs_name_node='192.168.1.176')
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #6
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(node_count=1)
     else:
         h2o_hosts.build_cloud_with_hosts(node_count=1)
Example #7
    def setUpClass(cls):
        # Uses your username specific json: pytest_config-<username>.json

        # do what my json says, but with my hdfs. hdfs_name_node from the json
        # I'll set use_hdfs to False here, because H2O won't start if it can't talk to the hdfs
        # h2o_hosts.build_cloud_with_hosts(use_hdfs=False)
        h2o.build_cloud(1, java_heap_GB=14, use_hdfs=True, java_extra_args='-verbose:class')
Example #8
    def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(2,java_heap_GB=4,java_extra_args='-XX:+PrintCompilation')
     else:
         h2o_hosts.build_cloud_with_hosts(java_extra_args='-XX:+PrintCompilation')
Example #9
    def testCloud(self):
        base_port = 54300
        ports_per_node = 2

        print "\nTest was written because seeing a bigger cloud than we want sometimes"
        print "You'll see the problem in the cloud in the browser"
        print "\nWorks if real ip address used. fails with 127.0.0.1 (intermittent)"
        print "Builds cloud with 3, the extra being a non-127.0.0.1 node (the real ip)"
        print "Eventually it goes away, around 1 minute?"
        for trial in range(20):
            for tryNodes in range(2,3):
                sys.stdout.write('.')
                sys.stdout.flush()

                start = time.time()
                ### this works
                ### h2o.build_cloud(use_this_ip_addr="192.168.0.37",
                # this intermittently fails
                h2o.build_cloud(use_this_ip_addr="127.0.0.1", 
                    node_count=tryNodes, base_port=base_port, java_heap_GB=1,
                    timeoutSecs=15, retryDelaySecs=2)
                print "trial #%d: Build cloud of %d in %d secs" % (trial, tryNodes, (time.time() - start)) 

                h2o.verify_cloud_size()
                h2o.tear_down_cloud()

                # increment the base_port to avoid sticky ports when we do another
                # we only use two ports now?
                base_port += ports_per_node * tryNodes
Example #10
    def testAll(self):
        try:
            h2o.build_cloud(node_count=2)

            # we don't have the port or ip configuration here
            # that util/h2o.py does? Keep this in synch with spawn_h2o there.
            # also don't have --nosigar here?
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name=pytest-'+getpass.getuser(),
                    '-Dh2o.arg.ip='+h2o.get_ip_address(),
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    # The tests
                    'water.parser.ParserTest',
                    ])

            rc = ps.wait(None)
            out = file(stdout).read()
            err = file(stderr).read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))

        finally:
            h2o.tear_down_cloud()
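The wait/read/raise tail above reappears verbatim in Examples #15 and #25. If you reuse these snippets, it can be factored into a small helper; a minimal sketch (check_junit_result is a hypothetical name, not an h2o test utility):

def check_junit_result(ps, stdout, stderr):
    # rc is None if the spawned jvm never exited (treated as a timeout), nonzero on test failure
    rc = ps.wait(None)
    out = file(stdout).read()
    err = file(stderr).read()
    if rc is None:
        ps.terminate()
        raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
    elif rc != 0:
        raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))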
Example #11
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(1,java_heap_GB=14, enable_benchmark_log=True)
     else:
         h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=True)
Example #12
 def setUpClass(cls):
     global SEED, localhost
     localhost = h2o.decide_if_localhost()
     if localhost:
         h2o.build_cloud(java_heap_GB=10)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #13
 def setUpClass(cls):
     global local_host
     local_host = 'hosts' not in os.getcwd()
     if (local_host):
         h2o.build_cloud(2,java_heap_GB=4,java_extra_args='-XX:+PrintCompilation')
     else:
         h2o_hosts.build_cloud_with_hosts(java_extra_args='-XX:+PrintCompilation')
Example #14
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(2,java_heap_GB=10,use_flatfile=True)
     else:
         import h2o_hosts
         h2o_hosts.build_cloud_with_hosts()
Example #15
 def test_B_slow_junit(self):
     h2o.tear_down_cloud()
     h2o.build_cloud(node_count=2)
     # we don't have the port or ip configuration here
     # that util/h2o.py does? Keep this in synch with spawn_h2o there.
     # also don't have --nosigar here?
     (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
             'java',
             '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
             '-Dh2o.arg.name='+h2o.cloud_name(),
             '-Dh2o.arg.ip='+h2o.get_ip_address(),
             '-ea', '-jar', h2o.find_file('target/h2o.jar'),
             '-mainClass', 'org.junit.runner.JUnitCore',
             # The tests
             'water.ConcurrentKeyTest',
             'hex.MinorityClassTest'
             ])
     # getting UDP receiver stack traces if we shut down quickly after Junit
     # may need to wait a little bit before shutdown?
     time.sleep(3)
     rc = ps.wait(None)
     out = file(stdout).read()
     err = file(stderr).read()
     if rc is None:
         ps.terminate()
         raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
     elif rc != 0:
         raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
Example #16
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if localhost:
         h2o.build_cloud()
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #17
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(3)
     else:
         h2o_hosts.build_cloud_with_hosts()
     h2b.browseTheCloud()
Example #18
    def test_import_covtype_parse_3jvm_fvec(self):
        h2o.beta_features = True
        csvFilename = "covtype.data"
        importFolderPath = "standard"
        trialMax = 2
        for tryHeap in [1]:
            print "\n", tryHeap,"GB heap, 3 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(node_count=3, java_heap_GB=tryHeap)
            else:
                h2o_hosts.build_cloud_with_hosts(node_count=3, java_heap_GB=tryHeap)

            for trial in range(trialMax):
                # import each time, because h2o deletes source file after parse
                csvPathname = importFolderPath + "/" + csvFilename
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=20)
            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(5)

        # print "Waiting 60 secs for TIME_WAIT sockets to go away"
        # time.sleep(60)
        time.sleep(2)
Example #19
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if localhost:
         # h2o.build_cloud(3, java_heap_GB=4, base_port=54323)
         h2o.build_cloud(3, java_heap_GB=12, base_port=54323)
     else:
         h2o_hosts.build_cloud_with_hosts(base_port=54323)
Example #20
 def setUpClass(cls):
     global SEED, localhost
     SEED = h2o.setup_random_seed()
     localhost = h2o.decide_if_localhost()
     if localhost:
         h2o.build_cloud(2)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #21
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     h2o.beta_features = True
     if localhost:
         h2o.build_cloud(3, java_heap_GB=1, use_hdfs=True, base_port=54321)
     else:
         h2o_hosts.build_cloud_with_hosts(base_port=54321)
Example #22
 def setUpClass(cls):
     global local_host
     local_host = not "hosts" in os.getcwd()
     if local_host:
         h2o.build_cloud(1, java_heap_GB=1)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #23
File: test_maprfs.py  Project: Jfeng3/h2o
 def setUpClass(cls):
     # assume we're at 0xdata with its hdfs namenode
     global localhost
     localhost = h2o.decide_if_localhost()
     # alternate mapr settings:
     # hdfs_config='/opt/mapr/conf/mapr-clusters.conf',
     # hdfs_name_node='mr-0x1.0xdata.loc:7222'
     # hdfs_version='mapr2.1.3',
     if localhost:
         h2o.build_cloud(
             1,
             java_heap_GB=15,
             enable_benchmark_log=True,
             use_maprfs=True,
             hdfs_version="mapr3.0.1",
             hdfs_name_node="192.168.1.171:7222",
         )
     else:
         h2o_hosts.build_cloud_with_hosts(
             1,
             java_heap_GB=15,
             enable_benchmark_log=True,
             use_maprfs=True,
             hdfs_version="mapr3.0.1",
             hdfs_name_node="192.168.1.171:7222",
         )
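Both branches above pass identical keyword arguments; a shared kwargs dict avoids the duplication. A sketch preserving the values from this example:

 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     # same settings either way; only the builder differs
     kwargs = dict(
         java_heap_GB=15,
         enable_benchmark_log=True,
         use_maprfs=True,
         hdfs_version="mapr3.0.1",
         hdfs_name_node="192.168.1.171:7222",
     )
     if localhost:
         h2o.build_cloud(1, **kwargs)
     else:
         h2o_hosts.build_cloud_with_hosts(1, **kwargs)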
Example #24
    def setUpClass(cls):
        print "Will build_cloud() with random heap size and do overlapped import folder/parse (groups)"
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        if RANDOM_HEAP:
            tryHeap = random.randint(4, 28)
        else:
            tryHeap = 28

        # print "\n", tryHeap,"GB heap, 1 jvm per host, import 192.168.1.176 hdfs, then parse"
        print "\n", tryHeap, "GB heap, 1 jvm per host, import,  then parse"
        localhost = h2o.decide_if_localhost()
        h2o.beta_features = True  # for the beta tab in the browser
        if localhost:
            h2o.build_cloud(
                node_count=3,
                java_heap_GB=4,
                base_port=54323,
                # use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3'
            )
        else:
            h2o_hosts.build_cloud_with_hosts(
                node_count=1,
                java_heap_GB=tryHeap,
                base_port=54321,
                # use_hdfs=True, hdfs_name_node='192.168.1.176', hdfs_version='cdh3'
            )
Example #25
File: test_junit.py  Project: EPBaron/h2o
    def test_A_all_junit(self):
        try:
            h2o.build_cloud(node_count=2, java_heap_GB=3)

            # we don't have the port or ip configuration here
            # that util/h2o.py does? Keep this in synch with spawn_h2o there.
            # also don't have --nosigar here?
            (ps, stdout, stderr) = h2o.spawn_cmd('junit', [
                    'java',
                    '-Xms3G',
                    '-Xmx3G',
                    '-Dh2o.arg.ice_root='+h2o.tmp_dir('ice.'),
                    '-Dh2o.arg.name='+h2o.cloud_name(),
                    '-Dh2o.arg.ip='+h2o.get_ip_address(),
                    '-Dh2o.arg.port=54666',
                    '-ea', '-jar', h2o.find_file('target/h2o.jar'),
                    '-mainClass', 'org.junit.runner.JUnitCore',
                    # The all test suite
                    'water.suites.AllTestsSuite'
                   ])

            rc = ps.wait(None)
            out = file(stdout).read()
            err = file(stderr).read()
            if rc is None:
                ps.terminate()
                raise Exception("junit timed out.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))
            elif rc != 0:
                raise Exception("junit failed.\nstdout:\n%s\n\nstderr:\n%s" % (out, err))

        finally:
            h2o.tear_down_cloud()
Example #26
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(3)
     else:
         h2o_hosts.build_cloud_with_hosts()
     h2o.beta_features = True
Example #27
 def setUpClass(cls):
     global local_host
     local_host = 'hosts' not in os.getcwd()
     if (local_host):
         h2o.build_cloud(1,java_heap_GB=4)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #28
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(node_count=1, java_heap_GB=10, base_port=54333)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #29
 def setUpClass(cls):
     global localhost
     localhost = h2o.decide_if_localhost()
     if localhost:
         h2o.build_cloud(node_count=1, java_heap_GB=10)
     else:
         h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=10)
Example #30
    def test_import_covtype_parse_loop(self):
        csvFilename = "covtype.data"
        importFolderPath = "/home/0xdiag/datasets/standard"
        trialMax = 2
        localhost = h2o.decide_if_localhost()
        for tryHeap in [4, 3, 2, 1]:
            print "\n", tryHeap, "GB heap, 1 jvms, import folder, then loop parsing 'covtype.data' to unique keys"
            if (localhost):
                h2o.build_cloud(node_count=1, java_heap_GB=tryHeap)
            else:
                h2o_hosts.build_cloud_with_hosts(
                    node_count=1, java_heap_GB=tryHeap)

            for trial in range(trialMax):
                # import each time, because h2o deletes source file after parse
                h2i.setupImportFolder(None, importFolderPath)
                key2 = csvFilename + "_" + str(trial) + ".hex"
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilename,
                    importFolderPath,
                    key2=key2,
                    timeoutSecs=20)
            # sticky ports?
            h2o.tear_down_cloud()
            time.sleep(2)
Example #31
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(node_count=2,java_heap_GB=7)
     else:
         h2o_hosts.build_cloud_with_hosts(node_count=1,java_heap_GB=13)
Example #32
 def setUpClass(cls):
     h2o.build_cloud(1)
     global SYNDATASETS_DIR
     SYNDATASETS_DIR = h2o.make_syn_dir()
Example #33
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(1, java_heap_GB=4)
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #34
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000
        avgMichalSize = 116561140
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize = 183
        if 1 == 1:
            # importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            # importFolderPathFull = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            # importFolderPath = 'more1_1200_link'
            importFolderPath = 'manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            # this pattern from browser correctly does 100 files, 1M rows
            # source_key=*/home/0xdiag/datasets/manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz
            csvFilenameAll = [
                ("file_1.dat.gz", "file_1_A.dat.gz", 1 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz",
                 100 * avgMichalSize, 3600),

                # ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                # ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz",
                 140 * avgMichalSize, 3600),

                # ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                # ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),

                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # DO_IMPORT_CHECK is assumed here; the snippet references it below but its
        # module-level definition wasn't captured. True keeps importResult defined
        # for the delete_keys_from_import_result() cleanup at the end.
        DO_IMPORT_CHECK = True
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 180
        retryDelaySecs = 10

        localhost = h2o.decide_if_localhost()
        if localhost:
            tryHeap = 4
            h2o.build_cloud(2,
                            java_heap_GB=tryHeap,
                            base_port=base_port,
                            enable_benchmark_log=True)
        else:
            tryHeap = 28
            h2o_hosts.build_cloud_with_hosts(1,
                                             java_heap_GB=tryHeap,
                                             base_port=base_port,
                                             enable_benchmark_log=True)

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2
            h2o.beta_features = True

            for trial in range(trialMax):
                # (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")

                if DO_IMPORT_CHECK:
                    for _ in range(2):  # was 'i', which shadowed the enumerate index used below
                        csvPathname = importFolderPath + "/" + csvFilepattern
                        (importResult, importPattern) = h2i.import_only(
                            bucket='home-0xdiag-datasets',
                            path=csvPathname,
                            schema='local',
                            timeoutSecs=timeoutSecs)

                        importFullList = importResult['files']
                        importFailList = importResult['fails']
                        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                            importFailList)
                        # creates csvFilename.hex from file in importFolder dir

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                csvPathname = importFolderPath + "/" + csvFilepattern
                start = time.time()
                parseResult = h2i.import_parse(
                    bucket='home-0xdiag-datasets',
                    path=csvPathname,
                    schema='local',
                    hex_key=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             timeoutSecs=360)
                h2o_cmd.infoFromInspect(inspect, csvPathname)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        # parseResult = h2i.import_parse(path=importFolderPath + "/" + csvFilepattern,
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        start = time.time()
                        parseResult = h2i.import_parse(
                            path=csvPathname,
                            hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(
                            None,
                            parseResult['destination_key'],
                            timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                    if (i + 2) < len(csvFilenameList):
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        csvPathname = importFolderPathFull + "/" + csvFilepattern
                        parseResult = h2i.import_parse(
                            path=csvPathname,
                            hex_key=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)
                        elapsed = time.time() - start
                        print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds",\
                            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                        inspect = h2o_cmd.runInspect(
                            None,
                            parseResult['destination_key'],
                            timeoutSecs=360)
                        h2o_cmd.infoFromInspect(inspect, csvPathname)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response'][
                    'time']
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']

                # BUG here?
                if not noPoll:
                    pass
                    # We should be able to see the parse result?
                    # h2o_cmd.check_enums_from_inspect(parseResult)

                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                # h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRF takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRF(trees=1,depth=25,parseResult=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [
                            3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19,
                            20, 424, 425, 426, 540, 541, 378
                    ]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x,
                        'y': 378,
                        'case': 15,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult,
                                         timeoutSecs=timeoutSecs,
                                         **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************
                # print "Waiting 30 secs"
                # time.sleep(30)

                h2o_cmd.checkKeyDistribution()
                h2i.delete_keys_from_import_result(pattern=csvFilename,
                                                   importResult=importResult)
                h2o.nodes[0].remove_all_keys()

                ### time.sleep(3600)

                ### h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
Example #35
def build_cloud_with_hosts(node_count=None, **kwargs):
    ## if not h2o.disable_time_stamp:
    ##      sys.stdout = h2o.OutWrapper(sys.stdout)
    # legacy: we allow node_count to be positional.
    # if it's used positionally, stick it in kwargs (overwrite if there too)
    if node_count is not None:
        # we use h2o_per_host in the config file. will translate to node_count for build_cloud
        kwargs['h2o_per_host'] = node_count
        # set node_count to None to make sure we don't use it below. 'h2o_per_host' should be used
        node_count = None

    # randomizing default base_port used
    offset = random.randint(0, 31)
    # for new params:
    # Just update this list with the param name and default and you're done
    allParamsDefault = {
        'use_flatfile': None,
        'use_hdfs': True,  # default to true, so when we flip import folder to hdfs+s3n import on ec2, the cloud is built correctly
        'hdfs_name_node': None,
        'hdfs_config': None,
        'hdfs_version': None,
        'java_heap_GB': None,
        'java_heap_MB': None,
        'java_extra_args': None,
        'timeoutSecs': 60,
        'retryDelaySecs': 2,
        'cleanup': True,
        'slow_connection': False,
        'h2o_per_host': 2,
        'ip': '["127.0.0.1"]',  # this is for creating the hosts list
        'base_port': 54300 + offset,
        'username': '******',
        'password': None,
        'rand_shuffle': True,
        'use_home_for_ice': False,
        'key_filename': None,
        'aws_credentials': None,
        'redirect_import_folder_to_s3_path': None,
        'redirect_import_folder_to_s3n_path': None,
        'disable_h2o_log': False,
        'enable_benchmark_log': False,
        'h2o_remote_buckets_root': None,
        'conservative': False,
        'create_json': False,
        # pass this from cloud building to the common "release" h2o_test.py classes
        # for deciding whether keys should be deleted when a test ends.
        'delete_keys_at_teardown': False,
        'clone_cloud': False,
        'cloud_name': None,
        'force_tcp': None,
        'random_udp_drop': None,
        'sandbox_ignore_errors': None,
    }
    # initialize the default values
    paramsToUse = dict(allParamsDefault)

    # allow user to specify the config json at the command line. config_json is a global.
    if h2o.config_json:
        configFilename = find_config(h2o.config_json)
    else:
        # configs may be in the testdir_hosts
        configFilename = find_config(h2o.default_hosts_file())

    h2o.verboseprint("Loading host config from", configFilename)
    with open(configFilename, 'rb') as fp:
        hostDict = json.load(fp)

    for k, v in hostDict.iteritems():
        # Don't take in params that we don't have in the list above
        # Because michal has extra params in here for ec2! and comments!
        if k in paramsToUse:
            paramsToUse[k] = v

    # Now overwrite with anything passed by the test
    # whatever the test passes, always overrules the config json
    for k, v in kwargs.iteritems():
        paramsToUse[k] = v

    # Let's assume we should set the h2o_remote_buckets_root (only affects
    # schema=local), to the home directory of whatever remote user
    # is being used for the hosts. Better than living with a decision
    # we made from scanning locally (remote might not match local)
    # assume the remote user has a /home/<username> (linux targets?)
    # This only affects import folder path name generation by python tests
    if paramsToUse['username']:
        paramsToUse['h2o_remote_buckets_root'] = "/home/" + paramsToUse['username']

    h2o.verboseprint("All build_cloud_with_hosts params:", paramsToUse)

    #********************
    global hosts
    hosts = []
    # Update: special case paramsToUse['ip'] = ["127.0.0.1"] and use the normal build_cloud
    # this allows all the tests in testdir_host to be run with a special config that points to 127.0.0.1
    # hosts should be None for everyone if normal build_cloud is desired
    if paramsToUse['ip'] == ["127.0.0.1"]:
        hosts = None
    else:
        h2o.verboseprint("About to RemoteHost, likely bad ip if hangs")
        hosts = []
        for h in paramsToUse['ip']:
            h2o.verboseprint("Connecting to:", h)
            # expand any ~ or ~user in the string
            key_filename = paramsToUse['key_filename']
            if key_filename:  # don't try to expand if None
                key_filename = os.path.expanduser(key_filename)
            hosts.append(
                h2o.RemoteHost(addr=h,
                               username=paramsToUse['username'],
                               password=paramsToUse['password'],
                               key_filename=key_filename))

    # done with these, don't pass to build_cloud
    # this was the list of ip's from the config file, replaced by 'hosts' to build_cloud
    paramsToUse.pop('ip')

    # we want to save username in the node info. don't pop
    # paramsToUse.pop('username')
    paramsToUse.pop('password')
    paramsToUse.pop('key_filename')

    # flatfile is going into sandbox (LOG_DIR) now..so clean it first (will make sandbox dir if it doesn't exist already)
    h2o.clean_sandbox()

    # handles hosts=None correctly
    h2o.write_flatfile(
        node_count=paramsToUse['h2o_per_host'],
        # let the env variable H2O_PORT_OFFSET add in there
        base_port=paramsToUse['base_port'],
        hosts=hosts,
        rand_shuffle=paramsToUse['rand_shuffle'],
        port_offset=h2o.get_port_offset(),
    )

    if hosts is not None:
        # this uploads the flatfile too
        h2o.upload_jar_to_remote_hosts(
            hosts, slow_connection=paramsToUse['slow_connection'])
        # timeout wants to be larger for large numbers of hosts * h2oPerHost
        # use 60 sec min, 8 sec per node.
        timeoutSecs = max(60, 8 * (len(hosts) * paramsToUse['h2o_per_host']))
    else:  # for 127.0.0.1 case
        timeoutSecs = 60
    paramsToUse.pop('slow_connection')

    # sandbox gets cleaned in build_cloud
    # legacy param issue
    node_count = paramsToUse['h2o_per_host']
    paramsToUse.pop('h2o_per_host')
    print "java_heap_GB", paramsToUse['java_heap_GB']
    # don't wipe out or create the sandbox. already did here, and put flatfile there
    h2o.build_cloud(node_count, hosts=hosts, init_sandbox=False, **paramsToUse)
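For reference, the loader above only honors json keys that also appear in allParamsDefault. A minimal pytest_config-<username>.json, shown as the dict json.load would return, with purely illustrative values (the addresses, user, and key path are placeholders, not a real cluster):

hostDictExample = {
    'ip': ['192.168.1.161', '192.168.1.162'],  # placeholder host addresses
    'username': '<remote-user>',               # placeholder; feeds the /home/<username> buckets-root assumption above
    'h2o_per_host': 2,
    'java_heap_GB': 4,
    'base_port': 54321,
    'key_filename': '~/.ssh/id_rsa',           # expanded with os.path.expanduser above
    'use_hdfs': False,
}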
Example #36
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(3, use_hdfs=True, hdfs_version='cdh4', hdfs_name_node='172.16.2.176')
     else:
         h2o_hosts.build_cloud_with_hosts()
Example #37
 def setUpClass(cls):
     localhost = h2o.decide_if_localhost()
     if (localhost):
         h2o.build_cloud(node_count=4)
     else:
         h2o_hosts.build_cloud_with_hosts()