Example #1
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # import***************************************** 
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
                
            # the import has to overwrite existing keys. no parse
            h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "import", trial, "end ", 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the import"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000
        bucket = "home-0xdiag-datasets"
        importFolderPath = "manyfiles-nflx-gz"
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern
            hex_key = csvFilename + ".hex"

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            parseResult = h2i.import_parse(
                bucket=bucket, path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=600
            )
            execExpr = "A.hex=%s" % parseResult["destination_key"]
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example #3
    def test_exec2_fast_locks(self):
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
            
        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
    def test_50_nongz_fvec(self):
        h2o.beta_features = True
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        importFolderPath = 'airlines'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)


            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example #5
    def test_A_store_view(self):
        # size of H2O store
        store_size = 0
        # import data to have more files in the system
        r = h2i.import_only(bucket='smalldata', path='iris/*')
        store_size += len(r[0]['files'])
        r = h2i.import_only(bucket='smalldata', path='covtype/*')
        store_size += len(r[0]['files'])

        # list all items
        r = h2o.nodes[0].store_view(view=store_size)
        self.assertEqual(store_size, len(r['keys']))

        # page through the store view, 3 items per page
        items_per_page = 3                  # items per page
        pages = (store_size / items_per_page)    # number of pages
        if (store_size % items_per_page != 0): pages += 1
        offset = 0 # running offset
        cnt_items = 0  # counter of returned items
        for p in range(0,pages):
            r = h2o.nodes[0].store_view(offset=offset, view=items_per_page)
            print h2o.dump_json(r)
            cnt_items += len(r['keys']) 
            offset += items_per_page

        self.assertEqual(store_size, cnt_items)
    def test_parse_multi_exclude_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (300, 100, 'cA', 60, '*x[2-5]*'),
            (310, 200, 'cB', 60, '*x[1,3-5]*'),
            (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
            (330, 400, 'cD', 60, '*x[1-3-5]*'),
            (340, 500, 'cE', 60, '*x[1-4]*'),
            ]

        ## h2b.browseTheCloud()
        cnum = 0
        # create them all first
        for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            for fileN in range(FILENUM):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList)

        for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                print f
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f)

            # pattern match all, then use exclude
            parseResult = h2i.parse_only(pattern="*/syn_*",
                hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs)
            print "parseResult['destination_key']: " + parseResult['destination_key']
            print 'parse time:', parseResult['response']['time']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)


            # FIX! h2o strips one of the headers, but treats all the other files with headers as data
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            print "\n" + parseResult['destination_key'] + ":", \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # all should have rowCount rows (due to the excludePattern)
            self.assertEqual(numRows, rowCount*FILENUM, msg=("got numRows: %s. Should be rowCount: %s * FILENUM: %s" % \
                (numRows, rowCount, FILENUM)))
Example #7
    def test_cols_enum_multi_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (300, 100, 'cA', 60, '*x[2-5]*'),
            (310, 200, 'cB', 60, '*x[1,3-5]*'),
            (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
            (330, 400, 'cD', 60, '*x[1-3-5]*'),
            (340, 500, 'cE', 60, '*x[1-4]*'),
            ]

        ## h2b.browseTheCloud()
        cnum = 0
        # create them all first
        for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            for fileN in range(FILENUM):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList)

        for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                print f
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f)

            # pattern match all, then use exclude
            parseResult = h2i.parse_only(pattern="*/syn_*",
                hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs)
            print "parseResult['destination_key']: " + parseResult['destination_key']
            print 'parse time:', parseResult['response']['time']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)


            # FIX! h2o strips one of the headers, but treats all the other files with headers as data
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            print "\n" + parseResult['destination_key'] + ":", \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            # all should have rowCount rows (due to the excludePattern)
            self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \
                (num_rows, rowCount, FILENUM)))
Example #8
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):

            if DO_UNCOMPRESSED:
                csvFilename = "a_1.dat"
            else:
                csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # import*****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()

            # the import has to overwrite existing keys. no parse
            h2i.import_only(bucket='home-0xdiag-datasets',
                            path=csvPathname,
                            schema='put',
                            hex_key=hex_key,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=10,
                            pollTimeoutSecs=120,
                            doSummary=False)
            elapsed = time.time() - start
            print "import", trial, "end ", 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the import"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # exec does read lock on all existing keys
            if DO_EXEC:
                # fails
                execExpr = "A.hex=c(0,1)"
                # execExpr="A.hex=0;"
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
                h2o_cmd.runInspect(key='A.hex')

            print "\nTrying StoreView after the exec "
            h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
            # for node in h2o.nodes:
            #    h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800
             ),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern
            hex_key = csvFilename + ".hex"

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=600)
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example #10
    def test_H_Basic(self):
        # maybe best to extract the key from an import first?
        # this isn't used much, maybe we don't care about this

        h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
        headerKey = h2i.find_key('syn_header.csv')
        # comma 44 is separator
        h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv", header=1, header_from_file=headerKey, separator=44)

        # symbolic links work
        # ln -s /home/0xdiag/datasets home-0xdiag-datasets
        # lrwxrwxrwx 1 kevin kevin     21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
        h2i.import_parse(path="standard/covtype.data", bucket="home-0xdiag-datasets")
Example #11
    def test_parse_airline_multi_hdfs_many(self):

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*'  # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap,
                     random_udp_drop=RANDOM_UDP_DROP,
                     use_hdfs=True,
                     hdfs_name_node=NAME_NODE,
                     hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname,
                                          schema='hdfs',
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"

                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"

                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern,
                                             hex_key=hex_key,
                                             noPoll=True,
                                             delete_on_done=0,
                                             timeoutSecs=timeoutSecs,
                                             retryDelaySecs=10,
                                             pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #12
    def test_GLM_mnist_s3n_fvec(self):
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_testing.csv.gz",  "mnist_training.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_training.csv.gz",    600), 
        ]

        importFolderPath = "mnist"
        csvPathname = importFolderPath + "/*"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            # PARSE test****************************************
            csvPathname = importFolderPath + "/" + testCsvFilename
            testHexKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + trainCsvFilename
            trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # GLM****************************************
            y = 0 # first column is pixel value
            print "y:"
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'response': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'gaussian',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #13
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #14
def uploadit(n,
             bucket,
             path,
             src_key,
             hex_key,
             timeoutSecs=60,
             retryDelaySecs=1,
             pollTimeoutSecs=30):
    # apparently the putfile has some conflicts, but after the put completes, it's okay
    # to be parallel with the src_key if it has a different name
    (importResult, importPattern) = h2i.import_only(node=h2o.nodes[n],
                                                    bucket=bucket,
                                                    path=path,
                                                    schema='put',
                                                    src_key=src_key,
                                                    timeoutSecs=timeoutSecs,
                                                    retryDelaySecs=10,
                                                    pollTimeoutSecs=60)
    print "uploadit:", importPattern, hex_key
    # do the parse on the next node
    if UPLOAD_PARSE_DIFF_NODES:
        np1 = (n + 1) % len(h2o.nodes)
    else:
        np1 = n
    if DO_PARSE_ALSO:
        parseit(np1,
                importPattern,
                hex_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs)
        h2o.nodes[0].rebalance(before=hex_key, after=hex_key + "_2", chunks=32)
    return (importPattern, hex_key)
Example #15
def uploadit(n, bucket, path, src_key, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    # apparently the putfile has some conflicts, but after the put completes, it's okay
    # to be parallel with the src_key if it has a different name
    (importResult, importPattern) = h2i.import_only(
        node=h2o.nodes[n],
        bucket=bucket,
        path=path,
        schema="put",
        src_key=src_key,
        timeoutSecs=timeoutSecs,
        retryDelaySecs=10,
        pollTimeoutSecs=60,
    )
    print "uploadit:", importPattern, hex_key
    # do the parse on the next node
    if UPLOAD_PARSE_DIFF_NODES:
        np1 = (n + 1) % len(h2o.nodes)
    else:
        np1 = n
    if DO_PARSE_ALSO:
        parseit(
            np1,
            importPattern,
            hex_key,
            timeoutSecs=timeoutSecs,
            retryDelaySecs=retryDelaySecs,
            pollTimeoutSecs=pollTimeoutSecs,
        )
        h2o.nodes[0].rebalance(source=hex_key, after=hex_key + "_2", chunks=32)
    return (importPattern, hex_key)
    def test_parse_airline_multi_hdfs(self):
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname, schema='hdfs', 
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
    def test_parse_all_s3n_thru_hdfs(self):
        print "\nLoad a list of files from s3n, parse it thru HDFS"
        print "In EC2, michal's config always passes the right config xml"
        print "as arg to the java -jar h2o.jar. Only works in EC2"

        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/*'
        importResult = h2i.import_only(bucket=bucket, path=csvPathname, schema='s3n')
        s3nFullList = importResult['succeeded']
        print "s3nFullList:", h2o.dump_json(s3nFullList)
        self.assertGreater(len(s3nFullList),1,"Didn't see more than 1 files in s3n?")
        s3nList = random.sample(s3nFullList,8)

        timeoutSecs = 500
        for s in s3nList:
            s3nKey = s['key']
            s3nFilename = s['file']
            # there are some non-file key names returned? s3n metadata?
            # only use the keys with csv in their name
            if ('csv' not in s3nKey) or ('syn_dataset' in s3nKey) or ('.gz' in s3nKey):
                continue

            # creates csvFilename.hex from file in hdfs dir 
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            parseResult = h2i.parse_only(pattern=s3nKey, hex_key=s3nFilename + ".hex",
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

            print "parse result:", parseResult['destination_key']

            start = time.time()
            sys.stdout.flush() 
    def test_exec2_fast_locks_overlap(self):
        csvPathname = "iris/iris2.csv"
        src_key = "iris.csv"
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(
                bucket="smalldata", path=csvPathname, schema="put", src_key=src_key, timeoutSecs=10
            )
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        lastHexKey = None
        for trial in range(1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern) = h2i.import_only(
                    bucket="smalldata", path=csvPathname, schema="put", src_key=src_key, timeoutSecs=10
                )
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(
                pattern=src_key, hex_key=hex_key, noPoll=True, delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10
            )

            # wait until iteration 2, when lastHexKey is available, so you can operate on that
            if lastHexKey:
                execExpr = "%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y + 1, lastHexKey, y + 1, 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            lastHexKey = hex_key

            # since we are using the same source file, and potentially re-uploading if AVOID_BUG
            # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to
            # use it next iteration
            h2o_jobs.pollWaitJobs(timeoutSecs=10)

        # just show the jobs still going. Shouldn't be any
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example #19
    def test_H_Basic(self):
        # maybe best to extract the key from an import first?
        # this isn't used much, maybe we don't care about this

        h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
        headerKey = h2i.find_key('syn_header.csv')
        # comma 44 is separator
        h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv",
                         header=1,
                         header_from_file=headerKey,
                         separator=44)

        # symbolic links work
        # ln -s /home/0xdiag/datasets home-0xdiag-datasets
        # lrwxrwxrwx 1 kevin kevin     21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
        h2i.import_parse(path="standard/covtype.data",
                         bucket="home-0xdiag-datasets")
Example #20
    def test_parse_cust(self):
        # run as user 0xcustomer to get access (with .json config and ssh key file specified)
        importFolderPath = '/mnt/0xcustomer-datasets'
        pollTimeoutSecs = 120
        retryDelaySecs = 30
        timeoutSecs = 300
        
        (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
        importFileList = importResult['files']
        importFailList = importResult['fails']
        importKeyList = importResult['keys']
        importDelList = importResult['dels']

        if len(importDelList)!=0:
            raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))

        if len(importFileList)<MINFILES:
            raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))

        if len(importKeyList)<MINFILES:
            raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))

        if len(importFailList)!=0:
            raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))


        # only parse files with .csv or .tsv in their name (no dirs like that?)
        goodKeyList = [key for key in importKeyList if ('.csv' in key  or '.tsv' in key)]
        trial = 0
        # just do 1?
        for i, importKey in enumerate(random.sample(goodKeyList,3)):
            print "importKey:", importKey
            trial +=1

            start = time.time() 
            # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
            # force header=0..should mean headers get treated as NAs
            parseResult = h2i.parse_only(pattern=importKey, header=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            origKey = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=origKey)
            h2o_cmd.infoFromInspect(inspect, origKey)

            execExpr = 'newKey = '+origKey+'[1,1]'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            newParseKey = {'destination_key': 'newKey'}

            h2o_cmd.checkKeyDistribution()
            h2o.nodes[0].remove_key(key=origKey)
            # a key isn't created for a scalar
            # h2o.nodes[0].remove_key(key='newKey')
        
        self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
    def test_parse_summary_manyfiles_s3_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [("manyfiles-nflx-gz", 800)]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            # change to 50 files
            csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs
            )

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path=csvPathname,
                schema="s3",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=10,
                pollTimeoutSecs=120,
            )
            elapsed = time.time() - start
            print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_parse_nflx_loop_hdfs_fvec(self):
        h2o.beta_features = True
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz

        # default
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                h2o_hosts.build_cloud_with_hosts(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55600,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 500
            importFolderPath = "datasets/airlines_multi"
            csvPathname = importFolderPath + "/" + csvFilePattern
            parseResult = h2i.import_only(path=csvPathname, schema='hdfs',
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)

            for trial in range(TRIAL_MAX):
                # each parse now just does one
                csvFilePattern = "*%s.csv" % trial
                # if we want multifile
                # csvFilePattern = "*"

                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                # print "Don't wait for completion. Just load things up!"
    
                print "Drat. the source file is locked if we noPoll. Would have to increment across the individual files?"
                
                print "Drat. We can't re-import the folder, if there's a parse using one of the source files?"
                parseResult = h2i.parse_only(pattern=csvFilePattern, hex_key=hex_key, noPoll=True, delete_on_done=0,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=30)
            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #23
    def test_parse_manyfiles_1(self):
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            if DO_UNCOMPRESSED:
                csvFilename = "a_1.dat"
            else:
                csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # import***************************************** 
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
                
            # the import has to overwrite existing keys. no parse
            h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "import", trial, "end ", 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the import"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # exec does read lock on all existing keys
            if DO_EXEC:
                # fails
                execExpr="A.hex=c(0,1)"
                # execExpr="A.hex=0;"
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
                h2o_cmd.runInspect(key='A.hex')

            print "\nTrying StoreView after the exec "
            h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
            # for node in h2o.nodes:
            #    h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #24
    def test_exec2_fast_locks_overlap(self):
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        lastHexKey = None
        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)

            # wait until iteration 2, when lastHexKey is available, so you can operate on that
            if lastHexKey:
                execExpr="%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            lastHexKey = hex_key

            # since we are using the same source file, and potentially re-uploading if AVOID_BUG
            # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to 
            # use it next iteration
            h2o_jobs.pollWaitJobs(timeoutSecs=10)
            
        # just show the jobs still going. Shouldn't be any
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example #25
    def test_parse_summary_airline_s3(self):
        h2o.beta_features = True
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # this is schema='local'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #26
    def test_parse_with_cancel(self):
        mustWait = 10
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameList = [
            ("standard", "covtype.data", 54),
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            ("standard", "covtype20x.data", 54),
            ("manyfiles-nflx-gz", "file_[100-109].dat.gz", 378),
            ]

        # just loop on the same file. If remnants exist and are locked, we will blow up? 
        # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            hex_key = csvFilename + ".hex"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
                timeoutSecs=500, noPoll=True, doSummary=False)
            job_key = parseResult['job_key']

            # give it a little time to start
            time.sleep(3)
            h2o.nodes[0].jobs_cancel(key=job_key)

            # now wait until the job cancels, and we're idle
            h2o_jobs.pollWaitJobs(timeoutSecs=30)
            elapsed = time.time() - start
            print "Cancelled parse completed in", elapsed, "seconds."

            h2o.check_sandbox_for_errors()
            # get a list of keys from storeview. 20 is fine..shouldn't be many, since we putfile, not import folder
            # there may be a lot since we import the whole "standard" folder
            # find the ones that pattern match the csvFilename, and inspect them. Might be none
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
            keys = storeViewResult['keys']
            for k in keys:
                keyName = k['key']
                print "kevin:", keyName
                if csvFilename in keyName:
                    h2o_cmd.runInspect(key=keyName)
                    h2o.check_sandbox_for_errors()

            # This will tell h2o to delete using the key name from the import file, whatever pattern matches to csvFilename
            # we shouldn't have to do this..the import/parse should be able to overwrite without deleting.
            # h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

            # If you cancel a parse, you aren't allowed to reparse the same file or import a directory with that file,
            # or cause the key name that the parse would have used, for 5 seconds after the cancel request gets a json
            # response
            print "Waiting", mustWait, "seconds before next reparse-cancel."
            time.sleep(mustWait)
    def test_parse_summary_airline_s3(self):
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # this is schema='local'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip", 300),  # 110.9MB
            ("train_set.zip", 600),  # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets',
                                                        path="allstate",
                                                        schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets',
                                           path=csvPathname,
                                           schema='s3',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
    def test_parse_summary_manyfiles_1_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_1.dat.gz"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs)
            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            # pass numRows, so we know when na cnt means row is all na's
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, 
                numCols=numCols, numRows=numRows)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_50_nongz_fvec(self):
        h2o.beta_features = True
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        importFolderPath = 'airlines'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            h2o_cmd.runStoreView(timeoutSecs=60)
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #33
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000 * 2
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "many_many"
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*.dat", "file_18_A.dat", 18 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
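An illustrative restatement (not in the original test) of the throughput arithmetic used in the message above, assuming totalBytes is a raw byte count and elapsed is wall-clock seconds.

def file_mb_per_sec(total_bytes, elapsed_secs):
    # bytes -> megabytes (1e6), divided by elapsed wall-clock time
    return (total_bytes / 1e6) / elapsed_secs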
    def test_parse_airline_multi_hdfs(self):
        h2o.beta_features = True
        csvFilename = "hex_10"
        csvFilePattern = '*' # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55930, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)
            else:
                # why is 55609 already in use?? 
                h2o_hosts.build_cloud_with_hosts(sandbox_ignore_errors=True, force_tcp=True, java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, base_port=55604, disable_assertions=DISABLE_ASSERTIONS,
                    use_hdfs=True, hdfs_name_node=NAME_NODE, hdfs_version=VERSION)

            # don't raise exception if we find something bad in h2o stdout/stderr?
            # h2o.nodes[0].sandboxIgnoreErrors = True

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname, schema='hdfs', 
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
    def test_parse_airline_multi_hdfs(self):
        csvFilename = "hex_10"
        csvFilePattern = '*'  # all files in the folder

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap, "GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap,
                     random_udp_drop=RANDOM_UDP_DROP,
                     disable_assertions=DISABLE_ASSERTIONS,
                     use_hdfs=True,
                     hdfs_name_node=NAME_NODE,
                     hdfs_version=VERSION)

            timeoutSecs = 3600
            importFolderPath = "datasets/airlines_multi"

            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                importResult = h2i.import_only(path=csvPathname,
                                               schema='hdfs',
                                               timeoutSecs=timeoutSecs,
                                               retryDelaySecs=10,
                                               pollTimeoutSecs=60)
                print "importResult:", h2o.dump_json(importResult)

                parseResult = h2i.parse_only(pattern='*csv',
                                             hex_key=hex_key,
                                             timeoutSecs=timeoutSecs,
                                             retryDelaySecs=10,
                                             pollTimeoutSecs=120)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()
                # we don't delete the hex key. it will start spilling? slow

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #36
    def test_parse_with_cancel(self):
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameList = [
            ("standard", "covtype.data", 54),
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            ("standard", "covtype20x.data", 54),
            ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            ]

        # just loop on the same file. If remnants exist and are locked, we will blow up? 
        # Maybe try to do an inspect to see if either the source key or parse key exist and cause stack traces
        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            hex_key = csvFilename + ".hex"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=50)

            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key,
                timeoutSecs=500, noPoll=True, doSummary=False)
            job_key = parseResult['job_key']

            # give it a little time to start
            time.sleep(3)
            h2o.nodes[0].jobs_cancel(key=job_key)

            # now wait until the job cancels, and we're idle
            h2o_jobs.pollWaitJobs(timeoutSecs=30)
            elapsed = time.time() - start
            print "Cancelled parse completed in", elapsed, "seconds."

            h2o.check_sandbox_for_errors()
            # get a list of keys from storeview. 20 is fine..shouldn't be many, since we putfile, not import folder
            # there may be a lot since we import the whole "standard" folder
            # find the ones that pattern match the csvFilename, and inspect them. Might be none
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=timeoutSecs, view=100)
            keys = storeViewResult['keys']
            for k in keys:
                keyName = k['key']
                print "kevin:", keyName
                if csvFilename in keyName:
                    h2o_cmd.runInspect(key=keyName)
                    h2o.check_sandbox_for_errors()
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip",   300), # 110.9MB
            ("train_set.zip",  600), # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_parse_all_s3n_thru_hdfs(self):
        print "\nLoad a list of files from s3n, parse it thru HDFS"
        print "In EC2, michal's config always passes the right config xml"
        print "as arg to the java -jar h2o.jar. Only works in EC2"

        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/*'
        importResult = h2i.import_only(bucket=bucket,
                                       path=csvPathname,
                                       schema='s3n')
        s3nFullList = importResult['succeeded']
        print "s3nFullList:", h2o.dump_json(s3nFullList)
        self.assertGreater(len(s3nFullList), 1,
                           "Didn't see more than 1 files in s3n?")
        s3nList = random.sample(s3nFullList, 8)

        timeoutSecs = 500
        for s in s3nList:
            s3nKey = s['key']
            s3nFilename = s['file']
            # there are some non-file key names returned? s3n metadata?
            # only use the keys with csv in their name
            if ('csv' not in s3nKey) or ('syn_dataset'
                                         in s3nKey) or ('.gz' in s3nKey):
                continue

            # creates csvFilename.hex from file in hdfs dir
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            parseResult = h2i.parse_only(pattern=s3nKey,
                                         hex_key=s3nFilename + ".hex",
                                         timeoutSecs=timeoutSecs,
                                         retryDelaySecs=10,
                                         pollTimeoutSecs=60)

            print s3nFilename, 'parse time:', parseResult['response']['time']
            print "parse result:", parseResult['destination_key']

            start = time.time()
            sys.stdout.flush()
Example #39
    def test_50_nongz_fvec(self):
        h2o.beta_features = True
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
Example #40
    def parseFile(self, importFolderPath='datasets', csvFilename='airlines_all.csv', 
        timeoutSecs=500, **kwargs):
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()
        # do an import first, because we want to get the size of the file
        (importResult, importPattern) = h2i.import_only(path=csvPathname, schema="hdfs", timeoutSecs=timeoutSecs)
        succeeded = importResult['succeeded']
        if len(succeeded) < 1:
            raise Exception("Should have imported at least 1 key for %s" % csvPathname)

        # just do a search
        foundIt = None
        for f in succeeded:
            if csvPathname in f['key']:
                foundIt = f
		print "foundit f:", f
                break

        if not foundIt:
            raise Exception("Should have found %s in the imported keys for %s" % (importPattern, csvPathname))


        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "Parse of", parseResult['destination_key'], "took", elapsed, "seconds"
        parseResult['python_call_timer'] = elapsed
        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=200)
        elapsed = time.time() - start
        print "Inspect:", parseResult['destination_key'], "took", elapsed, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        print "num_rows:", num_rows, "num_cols", num_cols
        return parseResult
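A hypothetical call site for the parseFile helper above, using the defaults shown in its signature; it assumes the surrounding test class and an HDFS mount that actually contains datasets/airlines_all.csv.

parseResult = self.parseFile(importFolderPath='datasets',
                             csvFilename='airlines_all.csv',
                             timeoutSecs=500)
print "python_call_timer:", parseResult['python_call_timer']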
Example #41
    def test_rf_mnist_both_fvec(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename,
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            print "Not using ignore from this..have to adjust cols?"
            h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            ntree = 2
            params = {
                'response': 'C1',
                # 'ignored_cols_by_name': ignore_x, 
                'ntrees': ntree,
                'mtries': 28, # fix because we ignore some cols, which will change the sqrt(cols) calc?
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': 'RF_model',
                'nbins': 100,
                'importance': 0,
                'balance_classes': 0,
                }

            if rfSeed is None:
                params['seed'] = random.randint(0,sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # print 'rfView:', h2o.dump_json(rfView)
            h2o_rf.simpleCheckRFView(None, rfView, **params)
            modelKey = rfView['drf_model']['_key']

            # RFView (score on test)****************************************
            start = time.time()
            # FIX! 1 on oobe causes stack trace?
            kwargs = {'response': y}
            rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, 
                timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs)
            elapsed = time.time() - start
            print "RFView in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # training and test data are unique, so error won't be low?
            # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error)

            leaves = {
                'min': rfView['drf_model']['treeStats']['minLeaves'],
                'mean': rfView['drf_model']['treeStats']['meanLeaves'],
                'max': rfView['drf_model']['treeStats']['maxLeaves'],
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((leaves[l] - leavesExpected[l]) / float(leaves[l])) * 100
                d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': rfView['drf_model']['treeStats']['minDepth'],
                'mean': rfView['drf_model']['treeStats']['meanDepth'],
                'max': rfView['drf_model']['treeStats']['maxDepth'],
            }
            depthExpected = {'min': 20, 'mean': 20, 'max': 20}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((depth[l] - depthExpected[l]) / float(depth[l])) * 100
                d = "seed: %s depth %s %s %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"
    
        for d in allDelta:
            print d
Example #42
    def test_GBM_with_cancels(self):

        print "Sets h2o.beta_features like -bf at command line"
        print "this will redirect import and parse to the 2 variants"
        h2o.beta_features = True

        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            # ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            
            ### h2o.beta_features = False

            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', 
                timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

            # hack it because no response from Parse2
            if h2o.beta_features:
                parseResult = {'destination_key': 'c.hex'}

            print "\nparseResult", h2o.dump_json(parseResult)

            print "Parse result['destination_key']:", parseResult['destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified

            if importFolderPath=='manyfiles-nflx-gz':
                if DO_CLASSIFICATION:
                    # need to flip the right col! (R wise)
                    execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1)
                    kwargs = { 'str': execExpr }
                    resultExec = h2o_cmd.runExec(**kwargs)

                # lets look at the response column now
                s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
                x = range(542)
                # remove the output too! (378)
                xIgnore = []
                # BUG if you add unsorted 378 to end. remove for now
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]:
                    if i not in x:
                        print "x:", x
                        print 'missing?', i
                    x.remove(i)
                    xIgnore.append(i)

                x = ",".join(map(str,x))
                def colIt(x): return "C" + str(x)
                xIgnore = ",".join(map(colIt, xIgnore))
            else:
                # leave one col ignored, just to see?
                xIgnore = 0

            modelKey = "GBMGood"
            params = {
                'destination_key': modelKey,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
                }

            kwargs = params.copy()
            timeoutSecs = 1800
            start = time.time()
            GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
            print "\nGBMFirstResult:", h2o.dump_json(GBMFirstResult)
            # no pattern waits for all

            for i in range(20):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []     
                for j in range(5):
                    kwargs['destination_key'] = 'GBMBad' + str(j)
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs)
                    jobids.append(GBMFirstResult['job_key'])

                # have to pass the job id
                for j in jobids:
                    h2o.nodes[0].jobs_cancel(key=j)


            h2o_jobs.pollWaitJobs(pattern='GBMGood', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            elapsed = time.time() - start
            print "GBM training completed in", elapsed, "seconds."

            gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
            # errs from end of list? is that the last tree?
            errsLast = gbmTrainView['gbm_model']['errs'][-1]

            print "GBM 'errsLast'", errsLast
            if DO_CLASSIFICATION:
                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)
            else:
                print "GBMTrainView:", h2o.dump_json(gbmTrainView['gbm_model']['errs'])

            h2o.check_sandbox_for_errors()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
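A stripped-down sketch (not from the original test) of the launch-then-cancel pattern used above; parseResult and kwargs are assumed to be set up exactly as in the test.

# start a background GBM; noPoll=True returns immediately with a job key
kwargs['destination_key'] = 'GBMBad0'
backgroundGBM = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)

# cancel it by job key, then wait for the cluster to go idle again
h2o.nodes[0].jobs_cancel(key=backgroundGBM['job_key'])
h2o_jobs.pollWaitJobs(timeoutSecs=300)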
    def test_parse_multi_header_single_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
        ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 rowCount,
                                                 headerData=None,
                                                 rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 dataRowsWithHeader,
                                                 headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)
                print f

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception(
                        "Didn't find syn_header* key in the import")

            # use a regex. the only files in the dir will be the ones we just created, with a *fileN* match
            print "Header Key = " + header
            start = time.time()
            parseResult = h2i.parse_only(pattern='*' + rowxcol + '*',
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         header="1",
                                         header_from_file=header)
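            # (added note) header="1" plus header_from_file tells the parse to take its
            # column names from the separate syn_header_* key that was put above, while
            # the regex pattern selects the files generated for this rowxcol size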

            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['numCols'], totalCols,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], totalCols))
            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {
                    'sample_rate': 0.75,
                    'max_depth': 25,
                    'ntrees': 1,
                    'ignored_cols_by_name': 'ID,CAPSULE'
                }
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

            start = time.time()
            rfv = h2o_cmd.runRF(parseResult=parseResult,
                                timeoutSecs=timeoutSecs,
                                **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
Example #44
    def test_parse_nflx_loop_s3n_hdfs(self):
        DO_GLM = True
        DO_GLMGRID = False
        USE_S3 = False
        noPoll = False
        benchmarkLogging = ['jstack','iostats']
        benchmarkLogging = ['iostats']
        benchmarkLogging = []
        # typical size of the michal files
        avgMichalSize = 116561140
        avgSynSize = 4020000
        synSize = 183

        csvFilenameList = [
            (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[1-2][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_A.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_B.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_[123][0-9][0-9].dat.gz", "file_300_C.dat.gz", 300 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 300),
            (["manyfiles-nflx-gz"], "*file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
            (["manyfiles-nflx-gz"], "*file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 900),
            (["manyfiles-nflx-gz"], "*file_[5-9][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1[0-4][0-9].dat.gz", "file_50_B.dat.gz", 50 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_1[0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            (["manyfiles-nflx-gz"], "*file_2[0-9][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),
            # beware: the files should be non-overlapping sequentially if noPoll is used, to avoid deleting keys in use    
            (["A-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_200_x55.dat.gz", 200 * (avgMichalSize/2), 7200),
            (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_400_x55.dat.gz", 400 * (avgMichalSize/2), 7200),
            (["A-800-manyfiles-nflx-gz", "B-800-manyfiles-nflx-gz", "C-800-manyfiles-nflx-gz", "D-800-manyfiles-nflx-gz"],
                "*file_[0-9]*.dat.gz", "file_A_800_x55.dat.gz", 800 * (avgMichalSize/2), 7200),
        ]

        print "Using the -.gz files from s3"
        # want just s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        pollTimeoutSecs = 180
        retryDelaySecs = 10
        # use i to forward reference in the list, so we can do multiple outstanding parses below
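        # (added note) with noPoll=True the first import_parse below returns as soon as
        # the job is submitted; csvFilenameList[i+1] and [i+2] can then be parsed
        # concurrently, and pollWaitJobs() afterwards waits for all of them to finish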
        for i, (csvFolderList, csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):

            bucket = "home-0xdiag-datasets"
            ## for tryHeap in [54, 28]:
            h2oPerNode = 1
            # h1.4xlarge 60.5GB dram
            for tryHeap in [28]:
                if USE_S3:
                    protocol = "s3"
                else:
                    protocol = "s3n"
                print "\n", tryHeap,"GB heap,", h2oPerNode, "jvm per host, import", protocol, "then parse"
                
                # jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
                # jea = "-Dh2o.find-ByteBuffer-leaks=true"
                h2o.init(h2oPerNode, java_heap_GB=tryHeap, enable_benchmark_log=True, timeoutSecs=120, retryDelaySecs=10)
                # java_extra_args=jea,

                # don't raise exception if we find something bad in h2o stdout/stderr?
                h2o.nodes[0].sandboxIgnoreErrors = True

                for trial in range(trialMax):
                    # import a list of folders, one at a time (hdfs import can't take a pattern match)
                    # want to be able to parse 800 files, but only 200 per folder. Don't want to import the full bucket
                    # too slow
                    for csvFolder in csvFolderList:
                        # since we delete the key, we have to re-import every iteration, to get it again
                        # s3n URI thru HDFS is not typical.
                        if USE_S3:
                            (importResult, importPattern) = h2i.import_only(
                                bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='s3')
                        else:
                            (importResult, importPattern) = h2i.import_only(
                                bucket=bucket, path=csvFolder + "/" + csvFilepattern, schema='hdfs')

                        foundKeys = 0
                        for s in importResult['succeeded']:
                            # just print the first file
                            # if 'nflx' in key and 'file_1.dat.gz' in key: 
                            if csvFilepattern in s['key']:
                                # should be s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_1.dat.gz
                                print "example file we'll use:", s['key']
                                break
                            else:
                                pass
                            foundKeys += 1

                        ### print "s3nFullList:", h2o.dump_json(s3nFullList)
                        # error if none? 
                        self.assertGreater(foundKeys,8,"Didn't see more than 8 files in s3n?")

                    src_key = csvFilepattern
                    hex_key = csvFilename + "_" + str(trial) + ".hex"
                    print "Loading", protocol, "key:", src_key, "to", hex_key
                    start = time.time()
                    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                        timeoutSecs=timeoutSecs, 
                        retryDelaySecs=retryDelaySecs,
                        pollTimeoutSecs=pollTimeoutSecs,
                        noPoll=noPoll,
                        benchmarkLogging=benchmarkLogging)

                    if noPoll:
                        if (i+1) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (_, csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                            src_key = csvFilepattern
                            hex_key = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", src_key, "to", hex_key
                            parse2Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                                timeoutSecs=timeoutSecs,
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                        if (i+2) < len(csvFilenameList):
                            time.sleep(1)
                            h2o.check_sandbox_for_errors()
                            (_, csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                            src_key = csvFilepattern
                            hex_key = csvFilename + "_" + str(trial) + ".hex"
                            print "Loading", protocol, "key:", src_key, "to", hex_key
                            parse3Result = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvFolder + "/" + csvFilepattern,
                                timeoutSecs=timeoutSecs, 
                                retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                noPoll=noPoll,
                                benchmarkLogging=benchmarkLogging)

                    elapsed = time.time() - start
                    print "parse result:", parseResult['destination_key']
                    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                    # print stats on all three if noPoll
                    if noPoll:
                        # does it take a little while to show up in Jobs, from where we issued the parse?
                        time.sleep(2)
                        # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                        h2o_jobs.pollWaitJobs(pattern=csvFilename, 
                            timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                        # for getting the MB/sec closer to 'right'
                        totalBytes += totalBytes2 + totalBytes3
                        elapsed = time.time() - start
                        h2o.check_sandbox_for_errors()

                    if totalBytes is not None:
                        fileMBS = (totalBytes/1e6)/elapsed
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} MB/sec for {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, csvFilepattern, csvFilename, fileMBS, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    y = 378
                    if not noPoll:
                        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)


                    #**********************************************************************************
                    # Do GLM too
                    # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                    if DO_GLM or DO_GLMGRID:
                        # these are all the columns that are enums in the dataset...too many for GLM!
                        x = range(542) # don't include the output column
                        # remove the output too! (378)
                        for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, y]:
                            x.remove(i)
                        x = ",".join(map(str,x))

                        if DO_GLM:
                            algo = 'GLM'
                            GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
                            start = time.time()
                            glm = h2o_cmd.runGLM(parseResult=parseResult, 
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)

                        else:
                            algo = 'GLMGrid'
                            GLMkwargs = {'x': x, 'y': y, 'case': 15, 'case_mode': '>', 'family': 'binomial',
                                'max_iter': 10, 'n_folds': 1, 'beta_epsilon': 1e-4,
                                'lambda': '1e-4',
                                'alpha': '0,0.5',
                                'thresholds': '0.5'
                                }
                            start = time.time()
                            glm = h2o_cmd.runGLMGrid(parseResult=parseResult,
                                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                                pollTimeoutSecs=pollTimeoutSecs,
                                benchmarkLogging=benchmarkLogging, **GLMkwargs)
                            elapsed = time.time() - start
                            h2o_glm.simpleCheckGLMGrid(self, glm, None, **GLMkwargs)

                        h2o.check_sandbox_for_errors()
                        l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:s} {:6.2f} secs'.format(
                            len(h2o.nodes), tryHeap, algo, csvFilepattern, csvFilename, elapsed)
                        print l
                        h2o.cloudPerfH2O.message(l)

                    #**********************************************************************************
                    print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \
                          "Otherwise it would just parse the cached key."
                    ### storeView = h2o.nodes[0].store_view()
                    ### print "storeView:", h2o.dump_json(storeView)
                    # "key": "s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_84.dat.gz"
                    # have to do the pattern match ourself, to figure out what keys to delete
                    # we're deleting the keys in the initial import. We leave the keys we created
                    # by the parse. We use unique dest keys for those, so no worries.
                    # Leaving them is good because things fill up! (spill)
                    h2o_cmd.checkKeyDistribution()
                    h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)

                h2o.tear_down_cloud()
                # sticky ports? wait a bit.
                print "Waiting 30 secs before building cloud again (sticky ports?)"
                time.sleep(30)
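
The shell-style patterns above (e.g. "*[1][0-4][0-9].dat.gz") select a group of files by name; in the test the import/parse APIs do the actual matching. A minimal standalone sketch of what such a pattern selects, using a made-up key list and Python's fnmatch purely for illustration:

import fnmatch

succeeded = ['s3n://home-0xdiag-datasets/manyfiles-nflx-gz/file_%d.dat.gz' % i
             for i in range(1, 200)]
pattern = '*[1][0-4][0-9].dat.gz'
matches = [k for k in succeeded if fnmatch.fnmatch(k, pattern)]
print len(matches), "keys match", pattern      # 50 keys: file_100 .. file_149
print "example file we'd use:", matches[0]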
Example #45
    def sub_c3_fvec_long(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140
        bucket = "home-0xdiag-datasets"
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = "manyfiles-nflx-gz"
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes) == 1:
            csvFilenameList = [("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600)]
        else:
            csvFilenameList = [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 1800),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ["cpu", "disk", "network"]
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket,
                path=csvPathname,
                schema="local",
                hex_key=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging,
            )
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "Parse result['destination_key']:", parseResult["destination_key"]
            h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]:
                    x.remove(i)
                    ignore_x.append(i)
                x.remove(378)

                # add one because H2O column names are 1-based (no longer 0-based offsets)
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    "ignored_cols": ignore_x,
                    "response": "C379",
                    "max_iter": 4,
                    "n_folds": 1,
                    "family": "binomial",
                    "alpha": 0.2,
                    "lambda": 1e-5,
                }

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult["destination_key"]
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ("C379", "C379", 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                aHack = {"destination_key": "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
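
The GLM setup above converts 0-based column indices to H2O's 1-based "C<n>" column names. A minimal sketch of that mapping (the index list here is a made-up subset of the one used above):

# sketch only: 0-based indices -> 1-based "C<n>" names for ignored_cols / response
ignoreIdx = [3, 4, 5, 540, 541]
ignored_cols = ",".join(["C" + str(i + 1) for i in ignoreIdx])
response = "C" + str(378 + 1)
print ignored_cols      # C4,C5,C6,C541,C542
print response          # C379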
Example #46
    def test_exec_enums_rand_cut2(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            # (n, 10, 9, 'cE', 300),
            (n, 1, 1, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create CUT_EXPR_CNT cut expressions up front, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                MAX_COLS_IN_EXPR = iColCount
                cols = random.sample(range(MAX_COLS_IN_EXPR),
                                     random.randint(1, MAX_COLS_IN_EXPR))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings
                    if 1 == 1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice

                cutExprList = []
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # randomly pick == or !=
                        if random.randint(0, 1) == 0:
                            cutExprList.append('p$C' + str(i + 1) + '!=' + c)
                        else:
                            cutExprList.append('p$C' + str(i + 1) + '==' + c)

                cutExpr = ' & '.join(cutExprList)
                # print "cutExpr:", cutExpr

                # just extract one output col (the first one)
                rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1)
                # print "rowExpr:", rowExpr
                print rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************

            src_key = csvFilename
            # import the same file ten times under different src keys (prefixes A..J),
            # so the parse below can pick them all up with a '*' pattern
            for prefix in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
                parseResult = h2i.import_only(path=csvPathname,
                                              schema='put',
                                              src_key=prefix + src_key,
                                              timeoutSecs=200)

            parseResult = h2i.parse_only(pattern='*' + src_key,
                                         hex_key=hex_key,
                                         timeoutSecs=800)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            pNumRows = inspect['numRows']
            pNumCols = inspect['numCols']
            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception(
                    "Probably got a col NA'ed and constant values as a result %s"
                    % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1 == 1:
                a = 'a=c(1,2,3);' + ';'.join(
                    ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey),
                                                print_params=False)
                    ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(CUT_LOOP_CNT):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" %
                                        (fKey, random.choice(rowExprList)))
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                h2o_cmd.infoFromInspect(inspect, fKey)
                numRows = inspect['numRows']
                numCols = inspect['numCols']

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = 0
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey,
                                           column=column,
                                           quantile=quantile,
                                           max_qbins=MAX_QBINS,
                                           multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        #****************************************************************
        # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET
        print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col"
        quantile = 0.5 if DO_MEDIAN else .999
        # first output col. always fed by an exec cut, so 0?
        column = iColCount
        start = time.time()
        q = h2o.nodes[0].quantiles(source_key=hex_key,
                                   column='C' + str(iColCount + 1),
                                   quantile=quantile,
                                   max_qbins=MAX_QBINS,
                                   multiple_pass=0)
        elapsed = time.time() - start
        h2p.red_print(
            hex_key, pNumRows,
            "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")",
            "one iteration", elapsed, "secs. threshold:", quantile,
            q['result'])
        print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.'
        quantileTime = elapsed

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
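
For the single-input-column case above (iColCount=1, hex_key='p'), a generated cut expression and the row expression built from it look like this; the cut value 3 is arbitrary:

# sketch only: one possible cut expression and the resulting row expression
hex_key = 'p'
iColCount = 1
cutExpr = 'p$C1==3'     # randomly chosen ==/!= against one enum encoding
rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1)
print rowExpr           # p[p$C1==3,2];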
Example #47
            csvWrt.writerow(row)
        finally:
            output.close()


if __name__ == '__main__':
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)
    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=False)

    #AIRLINES
    airlinesTestParseStart = time.time()
    hK = "AirlinesHeader.csv"
    headerPathname = "bench/Airlines" + "/" + hK
    h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
    headerKey = h2i.find_key(hK)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets',
                                path='bench/Airlines/AirlinesTest.csv',
                                schema='local',
                                hex_key="atest.hex",
                                header=1,
                                header_from_file=headerKey,
                                separator=44,
                                timeoutSecs=4800,
                                retryDelaySecs=5,
                                pollTimeoutSecs=4800)
    elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
    row = {'testParseWallTime': elapsedAirlinesTestParse}
    response = 'IsDepDelayed'
    ignored = None
Example #48
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # the output column (378) must not be in the ignored list
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 10, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    # execExpr="A.hex=%s" % parseResult['destination_key']
                    # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
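
The exec expression above binarizes the response column (378, 0-based) in place so GLM can treat it as binomial. A minimal sketch of how that expression string is put together (column index and threshold taken from the code above):

# sketch only: build the exec expression that turns the response into a 0/1 target
col = 378 + 1           # H2O exec column indexing is 1-based
execExpr = 'A.hex[,%d]=(A.hex[,%d]>15)' % (col, col)
print execExpr          # A.hex[,379]=(A.hex[,379]>15)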
Example #49
    def test_RF_mnist_both(self):
        h2o.beta_features = True
        importFolderPath = "mnist"
        csvFilelist = [
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
            # to see results a 2nd time
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None,
             '*mnist*gz'),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        (importFolderResult,
         importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                          path=importFolderPath + "/*")
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        if 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            succeededList = importFolderResult['succeeded']

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        allDelta = []
        for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed,
             parsePattern) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           testCsvFilename,
                                           hex_key=testKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training"
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           parsePattern,
                                           hex_key=trainKey2,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # RF+RFView (train)****************************************
            # print "This is the 'ignore=' we'll use"
            # no longer use. depend on h2o to get it right.
            ntree = 25
            params = {
                'response': 0,
                'ntrees': ntree,
                # 'data_key='mnist_training.csv.hex'
                'mtries': 28,  # fixed, because we ignore some cols, which would change the sqrt(cols) calc?
                'max_depth': 2147483647,
                'select_stat_type': 'ENTROPY',
                'sampling_strategy': 'RANDOM',
                'sample_rate': 0.67,
                'oobee': 1,
                # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
                'destination_key': 'RF_model',
                'nbins': 1024,
                # 'seed': 784834182943470027,
                # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
            }

            if rfSeed is None:
                params['seed'] = random.randint(0, sys.maxint)
            else:
                params['seed'] = rfSeed
            print "RF seed:", params['seed']

            kwargs = params.copy()
            print "Trying rf"
            timeoutSecs = 1800
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        timeoutSecs=timeoutSecs,
                                        pollTimeoutSecs=180,
                                        retryDelaySecs=2,
                                        **kwargs)
            elapsed = time.time() - start
            print "RF completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            # RFView (score on test)****************************************
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
            # was 2.84
            # sometimes get 2.87?
            self.assertAlmostEqual(
                classification_error,
                1.6,
                delta=1.6,
                msg="Classification error %s differs too much" %
                classification_error)

            treeStats = rfView['speedrf_model']['treeStats']
            leaves = {
                'min': treeStats['minLeaves'],
                'mean': treeStats['meanLeaves'],
                'max': treeStats['maxLeaves']
            }
            # Expected values are from this case:
            # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027),
            leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
            for l in leaves:
                # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l]))
                delta = ((float(leaves[l]) - leavesExpected[l]) / leaves[l]) * 100
                d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (
                    params['seed'], l, leaves[l], leavesExpected[l], delta)
                print d
                allDelta.append(d)

            depth = {
                'min': treeStats['minDepth'],
                'mean': treeStats['meanDepth'],
                'max': treeStats['maxDepth']
            }
            depthExpected = {'min': 21, 'mean': 23.8, 'max': 25}
            for l in depth:
                # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l]))
                delta = ((float(depth[l]) - depthExpected[l]) / depth[l]) * 100
                d = "seed: %s %s depth: %s expected: %s pct. different %s" % (
                    params['seed'], l, depth[l], depthExpected[l], delta)
                print d
                allDelta.append(d)

            # Predict (on test)****************************************
            start = time.time()
            modelKey = rfView['speedrf_model']['_key']
            predict = h2o.nodes[0].generate_predictions(
                model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "generate_predictions in",  elapsed, "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        # Done *******************************************************
        print "\nShowing the results again from all the trials, to see variance"

        for d in allDelta:
            print d
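
The leaf and depth checks above compare tree stats to their expected values as a percent difference. A minimal sketch of that calculation as a helper (float() guards against Python 2 integer truncation; the numbers are the expected values quoted above):

# sketch only: percent difference between an observed tree stat and its expected value
def pct_diff(actual, expected):
    return (float(actual) - expected) / actual * 100

leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148}
print pct_diff(5010, leavesExpected['min'])    # ~0.28 pct different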
Example #50
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets',
                                           path="*",
                                           timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"

            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
Example #51
    def test_GBM_cancel_model_reuse(self):
        h2o.beta_features = True
        importFolderPath = 'standard'
        timeoutSecs = 500
        csvFilenameAll = [
            # have to use col name for response?
            ("manyfiles-nflx-gz", "file_1.dat.gz", 378),
            # ("manyfiles-nflx-gz", "file_[1-9].dat.gz", 378),
            # ("standard", "covtype.data", 54),
            # ("standard", "covtype20x.data", 54),
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for (importFolderPath, csvFilename, response) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename 
            print "FIX! is this guy getting cancelled because he's reusing a key name? but it should be okay?"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', 
                timeoutSecs=50)
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex', 
                timeoutSecs=500, noPoll=False, doSummary=False) # can't do summary until parse result is correct json

            h2o.check_sandbox_for_errors()

            # wait for it to show up in jobs?
            ## time.sleep(2)
            # no pattern waits for all
            ## h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            # print "\nparseResult", h2o.dump_json(parseResult)
            print "Parse result['destination_key']:", parseResult['destination_key']
            ## What's wrong here? too big?
            ### inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30, verbose=True)

            h2o.check_sandbox_for_errors()

            # have to avoid this on nflx data. colswap with exec
            # Exception: rjson error in gbm: Argument 'response' error: 
            # Only integer or enum/factor columns can be classified

            if DO_CLASSIFICATION:
                # need to flip the right col! (R wise)
                execExpr = 'c.hex[,%s]=c.hex[,%s]>15' % (response+1,response+1)
                kwargs = { 'str': execExpr }
                resultExec = h2o_cmd.runExec(**kwargs)

            # lets look at the response column now
            s = h2o_cmd.runSummary(key="c.hex", cols=response, max_ncols=1)
            # x = range(542)
            # remove the output too! (378)
            ignoreIndex = [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, response]
            # have to add 1 for col start with 1, now. plus the C
            xIgnore = ",".join(["C" + str(i+1) for i in ignoreIndex])

            params = {
                'destination_key': None,
                'ignored_cols_by_name': xIgnore,
                'learn_rate': .1,
                'ntrees': 2,
                'max_depth': 8,
                'min_rows': 1,
                'response': "C" + str(response+1),
                'classification': 1 if DO_CLASSIFICATION else 0,
                'grid_parallelism': 4,
                }

            kwargs = params.copy()
            timeoutSecs = 1800

            for i in range(5):
                # now issue a couple background GBM jobs that we'll kill
                jobids = []     
                for j in range(5):
                    # FIX! apparently we can't reuse a model key after a cancel
                    kwargs['destination_key'] = 'GBMBad' + str(j)
                    # rjson error in poll_url: Job was cancelled by user!
                    GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, **kwargs)
                    jobids.append(GBMFirstResult['job_key'])
                    h2o.check_sandbox_for_errors()
                    
                # have to pass the job id
                # for j in jobids:
                #     h2o.nodes[0].jobs_cancel(key=j)

                h2o_jobs.cancelAllJobs()
                # PUB-361. going to wait after cancel before reusing keys
                time.sleep(3)
                # am I getting a subsequent parse job cancelled?
                h2o_jobs.showAllJobs()

            if DELETE_KEYS:
                h2i.delete_keys_from_import_result(pattern=csvFilename, importResult=importResult)
Example #52
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees,
          depth, minrows, nbins, learnRate, response, row):
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug: pre = 'DEBUG'
        gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv, 'w')
            output.write(','.join(csv_header) + '\n')
        else:
            output = open(gbmbenchcsv, 'a')
        csvWrt = csv.DictWriter(output,
                                fieldnames=csv_header,
                                restval=None,
                                dialect='excel',
                                extrasaction='ignore',
                                delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + folderPath
            if (f in [
                    'AirlinesTrain1x', 'AllBedroomsTrain1x',
                    'AllBedroomsTrain10x', 'AllBedroomsTrain100x',
                    'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x'
            ]):
                csvPathname = importFolderPath + "/" + f + '.csv'
            else:
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           header=1,
                                           header_from_file=headerKey,
                                           separator=44,
                                           timeoutSecs=7200,
                                           retryDelaySecs=5,
                                           pollTimeoutSecs=7200)

            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime, " seconds."

            inspect_train = h2o.nodes[0].inspect(
                parseResult['destination_key'])
            inspect_test = h2o.nodes[0].inspect(testFilehex)

            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
            row.update({
                'h2o_build': build,
                'nMachines': nMachines,
                'nJVMs': len(h2o.nodes),
                'Xmx/JVM': java_heap_GB,
                'dataset': f,
                'nTrainRows': inspect_train['numRows'],
                'nTestRows': inspect_test['numRows'],
                'nCols': inspect_train['numCols'],
                'trainParseWallTime': parseWallTime,
                'classification': classification,
            })

            params = {
                'destination_key': 'GBM(' + f + ')',
                'response': response,
                'ignored_cols_by_name': ignored_cols,
                'classification': classification,
                'validation': testFilehex,
                'ntrees': ntrees,
                'max_depth': depth,
                'min_rows': minrows,
                'nbins': nbins,
                'learn_rate': learnRate,
            }

            kwargs = params.copy()
            gbmStart = time.time()
            #TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm = h2o_cmd.runGBM(parseResult=parseResult,
                                 noPoll=True,
                                 timeoutSecs=4800,
                                 **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200,
                                  pollTimeoutSecs=120,
                                  retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime = time.time() - gbmStart
            row.update({
                'gbmBuildTime': gbmTime,
            })
            #TODO(spencer): Add in gbm scoring
            #gbmScoreStart = time.time()
            #gbmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
            #scoreTime     = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
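
doGBM appends one row per run to a shared benchmark CSV via csv.DictWriter, which tolerates extra and missing keys. A minimal standalone sketch of that pattern (the file name and field list here are made up):

# sketch only: append a benchmark row, silently dropping keys not in the header
import csv

csv_header = ['dataset', 'trainParseWallTime', 'gbmBuildTime']
row = {'dataset': 'AirlinesTrain1x', 'gbmBuildTime': 12.3, 'extra': 'ignored'}
with open('gbmbench_example.csv', 'a') as output:
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                            dialect='excel', extrasaction='ignore', delimiter=',')
    csvWrt.writerow(row)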
Example #53
    def sub_c2_rel_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140 
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes)==1:
            csvFilenameList= [
                ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600),
            ]
        else:
            csvFilenameList= [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    ignore_x = []
                    # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]:
                        x.remove(i)
                        ignore_x.append(i)

                    # increment by one, because we are no longer zero-offset!
                    x = ",".join(map(lambda x: "C" + str(x+1), x))
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'family': 'binomial',
                        'x': x,
                        'y': 'C379', 
                        'case': 15, 
                        'case_mode': '>',
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
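
The MB/sec figure logged above is just bytes parsed divided by wall-clock seconds. A minimal sketch with a made-up elapsed time (totalBytes uses the 50-file average size from the list above):

# sketch only: the throughput metric written to the benchmark log
avgMichalSize = 116561140
totalBytes = 50 * avgMichalSize
elapsed = 900.0         # made-up wall-clock seconds
fileMBS = (totalBytes / 1e6) / elapsed
print '%6.2f MB/sec for %6.2f secs' % (fileMBS, elapsed)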
Example #54
    def test_hdfs_cdh4_fvec(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            # ("covtype.169x.data", 600),
            ("prostate_long_1G.csv", 600),
            # ("airlines_all.csv", 900),
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        print h2o.dump_json(d)
        d = h2i.import_only(path="datasets/*", schema='hdfs', timeoutSecs=1000)
        print h2o.dump_json(d)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           header=0,
                                           timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time(
            ) - start, 'secs'

            start = time.time()
            print "Saving", csvFilename, 'to HDFS'

            print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
            print "Unique per-user to avoid permission issues"
            username = getpass.getuser()
            # reuse the file name to avoid running out of space
            csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)

            path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
            h2o.nodes[0].export_files(src_key=hex_key,
                                      path=path,
                                      force=1,
                                      timeoutSecs=timeoutSecs)
            print "export_files of", hex_key, "to", path, "took", time.time(
            ) - start, 'secs'
            trial += 1

            print "Re-Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a2.hex"
            time.sleep(2)
            d = h2i.import_only(path=csvPathname,
                                schema='hdfs',
                                timeoutSecs=1000)
            print h2o.dump_json(d)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           header=0,
                                           timeoutSecs=1000)
            print "hdfs re-parse of", csvPathname, "took", time.time(
            ) - start, 'secs'
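
The body above is essentially one HDFS round trip per file: parse the source from HDFS, export the resulting frame back to HDFS under a per-user name, then re-import and re-parse the exported copy. A minimal sketch of that pattern, assuming the same h2i/h2o test harness (and getpass) is already imported and an HDFS name node is configured for the cloud:

    def hdfs_round_trip(self, csvPathname, timeoutSecs=1000):
        # sketch only: parse the HDFS file into an H2O frame
        parseResult = h2i.import_parse(path=csvPathname, schema='hdfs',
                                       hex_key="a.hex", header=0, timeoutSecs=timeoutSecs)
        # export the frame back to HDFS under a per-user path (assumed /tmp2 layout)
        exportPathname = "tmp2/a_export.%s.csv" % getpass.getuser()
        path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + exportPathname
        h2o.nodes[0].export_files(src_key="a.hex", path=path, force=1, timeoutSecs=timeoutSecs)
        # re-import and re-parse the exported copy to close the loop
        h2i.import_only(path=exportPathname, schema='hdfs', timeoutSecs=timeoutSecs)
        return h2i.import_parse(path=exportPathname, schema='hdfs',
                                hex_key="a2.hex", header=0, timeoutSecs=timeoutSecs)
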
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
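        # rejection-sample random names until we have 500 unique headers; duplicate
        # names would defeat the later random.sample() of unique column headers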
        for n in range(500): # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1,64) # random length headers
                headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
                # we keep trying if we already have that header name. Has to be unique.
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
            ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1
            
            DATA_HAS_HDR_ROW = random.randint(0,1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0
            
            GZIP_DATA = random.randint(0,1)
            GZIP_HEADER = random.randint(0,1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # Hack: if both the data file and the header file carry a header row,
            # keep the header file's separator in sync with the data separator
            # (h2o wants them to match; see the rule noted below).
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','


            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k,v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab. will autodetect space and comma
            if SEP_CHAR_GEN==" "  or SEP_CHAR_GEN==",": 
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)
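            # the separator is passed as an integer character code (ord) here; comma
            # and space are dropped from kwargs so h2o's auto-detection handles them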
            
            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0,1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0,1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "


            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER 

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what 
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            
            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData   = SEP_CHAR_GEN.join(hfdList)

        
            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, 
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname
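                    # renaming the raw copy keeps the later 'syn_data_*' glob from
                    # matching both the gzipped and uncompressed versions of the file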


            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, 
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER: 
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created 
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] =  1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] =  1
            else:
                kwargs['header'] =  0
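            # net effect: header=1 whenever any header source is declared (a
            # header_from_file key, or a header row embedded in the data); header=0 otherwise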

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*'
            else:
                pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            print "parseResult['destination_key']: " + parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header, 
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch in gen/param?
            h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1
                
            if 1==0: # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
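
Stripped of the randomized header/separator generation, the parse path in this test is: upload every generated file with schema='put' (so it also works against a remote cloud), then parse only the keys matching a glob pattern. A minimal sketch of that flow, assuming the same h2i harness and locally generated files in dirname:

    def put_and_parse(self, dirname, pattern, hex_key, timeoutSecs=60, **kwargs):
        # sketch only: upload each local file as a separate key via schema='put'
        for f in os.listdir(dirname):
            h2i.import_only(path=dirname + "/" + f, schema='put', noPrint=True)
        # parse just the keys whose names match the glob-style pattern into one frame
        return h2i.parse_only(pattern=pattern, hex_key=hex_key,
                              timeoutSecs=timeoutSecs, **kwargs)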