Example #1
    def test_NOPASS_create_frame_fail(self):
        h2o.beta_features = True

        for trial in range(20):
            kwargs = {
                'integer_range': None,
                'missing_fraction': 0.1,
                'cols': 10,
                'response_factors': 1,
                'seed': 1234,
                'randomize': 1,
                'categorical_fraction': 0,
                'rows': 1,
                'factors': 0,
                'real_range': 0,
                'value': None,
                'integer_fraction': 0,
            }

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Example #2
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult = h2i.import_parse(path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(rSummary, rows=numRows, cols=numCols)

        csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test,
                                       hex_key=validation_key,
                                       timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=validation_key)  # inspect the test parse, not train.hex
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
Example #3
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
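The per-node StoreView loop writes each node's JSON dump to a per-node file; as a design note, a with-block version of the loop body guarantees the handle closes even if dump_json raises (a sketch, same behavior):

    # equivalent loop body using a context manager
    with open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") as f:
        f.write(h2o.dump_json(storeViewResult))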
Example #4
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult  = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)

        csvPathname_test  = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=validation_key)  # inspect the test parse, not train.hex
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
Example #5
File: test_c7_rel.py Project: brennane/h2o
    def test_c7_rel(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB (ASCII 9) as the separator, rather than letting the parser guess
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600

        if DO_GLM:
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
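DO_GLM is another module-level switch this snippet leaves undefined; a plausible one-line sketch:

    DO_GLM = True   # hypothetical module-level switch gating the GLM step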
Example #6
    def test_NOPASS_exec2_empty_result(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=10)

        start = time.time()
        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=10)
            rSummary = h2o_cmd.runSummary(key="a")
            h2o_cmd.infoFromSummary(rSummary)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", 'took', time.time(
        ) - start, 'seconds'
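initList and exprList are referenced but not defined in this snippet (the same holds for Example #22); a hypothetical shape, assuming exec2-style expressions that leave a result under key 'a', the key the runSummary call reads back:

    # hypothetical: real entries would be exec2 expressions ending in key 'a'
    initList = [
        (None, 'x = i.hex'),   # alias the parsed frame
    ]
    exprList = [
        'a = x[,1]',           # e.g. select a column into key 'a'
    ]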
Example #7
    def test_0_NA_2enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100,  30, '0', 'cC', 100),
            (100,  30, '0.0', 'cC', 100),
            (100,  30, '0.0000000', 'cC', 100),
            ]

        for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename


            if DO_REBALANCE:
                print "Rebalancing it to create an artificially large # of chunks"
                rb_key = "rb_%s" % hex_key
                start = time.time()
                print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'
            else:
                rb_key = hex_key

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # the column index is 1-based in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
                # print "\nto_enum result:", h2o.dump_json(result)
                summaryResult = h2o_cmd.runSummary(key=hex_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']
                cardinality = stats['cardinality']
                if stattype != 'Enum':
                    raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" % (column_index, colname, stattype, coltype))
                # NAs are generated, so the count should be > 0; with this many rows we expect at least 1
                if nacnt<=0 or nacnt>rowCount:
                    raise Exception("column %s, which has name %s, somehow got NA cnt wrong after convert to Enum  %s %s" % 
                        (column_index, colname, nacnt, rowCount))
                if cardinality!=1: # NAs don't count?
                    # print "stats:", h2o.dump_json(stats)
                    print "column:", h2o.dump_json(column)
                    raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" % (column_index, colname, cardinality))
                h2o_cmd.infoFromSummary(summaryResult)
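write_syn_dataset is not shown; given the checks above (cardinality 1, NA count above zero but at most rowCount), a plausible sketch writes the zero token everywhere and leaves a small fraction of cells empty:

    # hypothetical generator consistent with the checks above: every cell is the
    # given zero token, with a small fraction left empty so each column has NAs
    import random

    def write_syn_dataset(csvPathname, rowCount, colCount, zero, SEED):
        r = random.Random(SEED)
        dsf = open(csvPathname, 'w')
        for _ in range(rowCount):
            rowData = [zero if r.random() > 0.1 else '' for _ in range(colCount)]
            dsf.write(','.join(rowData) + '\n')
        dsf.close()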
Example #8
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10

            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE,
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols': None,  # range(11, numCols) THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
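PARSER_TYPE, DO_KMEANS, and write_syn_dataset are assumed by this test and by Example #16 but not shown; a sketch with assumed values (PARSER_TYPE='SVMLight' mirrors the commented-out import_parse call above):

    # assumed module-level setup; PARSER_TYPE mirrors the commented-out call above
    PARSER_TYPE = 'SVMLight'
    DO_KMEANS = True

    # hypothetical: emit a few rows of SVMLight-format integer features
    def write_syn_dataset(csvPathname, trial):
        dsf = open(csvPathname, 'w')
        for i in range(100):
            # label, then 1-based col:value pairs
            dsf.write("%d 1:%d 2:%d 3:%d\n" % (i % 3, i, i + 1, i + 2))
        dsf.close()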
Example #9
File: h2o_kmeans.py Project: Jfeng3/h2o
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans["model"]
        model_key = model["_key"]
        centers = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult))
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        # no scoring on Kmeans2?.. just reuse cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
Example #10
File: h2o_kmeans.py Project: jmcclell/h2o
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        model_key = kmeans["model"]["_selfKey"]
        # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame

        # can't use inspect on a model key? now?
        kmeansResult = kmeans
        model = kmeansResult["model"]
        centers = model["clusters"]
        error = model["error"]
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        # no scoring on Kmeans2?.. just reuse cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # have to figure out how to get this with fvec
        sqr_error_per_cluster = [0 for h in hcnt]

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", centers[i]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
Example #11
    def test_parse_summary_manyfiles_s3_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [("manyfiles-nflx-gz", 800)]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            # change to 50 files
            csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs
            )

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path=csvPathname,
                schema="s3",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=10,
                pollTimeoutSecs=120,
            )
            elapsed = time.time() - start
            print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #12
    def test_storeview_import(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #13
    def test_parse_summary_airline_s3(self):
        h2o.beta_features = True
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # schema='s3' here (not schema='local')
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #14
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip", 300),  # 110.9MB
            ("train_set.zip", 600),  # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets',
                                                        path="allstate",
                                                        schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets',
                                           path=csvPathname,
                                           schema='s3',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
Example #15
    def test_parse_summary_airline_s3(self):
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # schema='s3' here (not schema='local')
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #16
File: test_libsvm.py Project: Brontai/h2o
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10
        
            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': None,  # range(11, numCols) THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Example #17
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #18
    def test_rebalance_int2enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000,  30, 'cC', 100),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=20)
            hex_key = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "\n" + csvFilename


            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % (hex_key)
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # the column index is 1-based in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
                # print "\nto_enum result:", h2o.dump_json(result)
                summaryResult = h2o_cmd.runSummary(key=hex_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']
                cardinality = stats['cardinality']
                if stattype != 'Enum':
                    raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" (column_index, colname, stattype, coltype))
                if nacnt!=0:
                    raise Exception("column %s, which has name %s, somehow got NAs after convert to Enum  %s" (column_index, colname, nacnt))
                if cardinality!=4:
                    raise Exception("column %s, which has name %s,  should have cardinality 4, got: %s" (column_index, colname, cardinality))
                h2o_cmd.infoFromSummary(summaryResult)
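write_syn_dataset is again not shown; since the checks expect zero NAs and cardinality 4 in every column, a plausible sketch draws each cell from four integer values:

    # hypothetical generator consistent with the checks above:
    # no missing values, exactly four distinct integer values per column
    import random

    def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
        r = random.Random(SEED)
        dsf = open(csvPathname, 'w')
        for _ in range(rowCount):
            rowData = [str(r.choice([0, 1, 2, 3])) for _ in range(colCount)]
            dsf.write(','.join(rowData) + '\n')
        dsf.close()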
Example #19
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #20
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename=='covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i,c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'int?"
                    assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)
Example #21
    def test_summary_with_x_libsvm (self):
        h2o.beta_features = True
        print "Empty rows except for the last, with all zeros for class. Single col at max"
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 100, 'cA', 300),
            (100000, 100, 'cB', 300),
            (100, 1000, 'cC', 300),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, 
                    doSummary=False)
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=colNumberMax+1, 
                    timeoutSecs=timeoutSecs)
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                self.assertEqual(colNumberMax+1, numCols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, numCols))
                self.assertEqual(rowCount, numRows, 
                    msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

                for x in range(numCols):
                    print "Doing summary with x=%s" % x
                    summaryResult = h2o_cmd.runSummary(key=hex_key, cols=x, timeoutSecs=timeoutSecs)
                    # skip the infoFromSummary check

                    colName = "C" + str(x+1)
                    print "Doing summary with col name x=%s" % colName
                    summaryResult = h2o_cmd.runSummary(key=hex_key, cols=colName, timeoutSecs=timeoutSecs)

                # do a final one with all columns for the current check below
                # FIX! we should update the check to check each individual summary result
                print "Doing and checking summary with no x=%s" % x
                summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
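write_syn_dataset here must return (colNumberMax, synColSumDict); the print at the top of the test ("Empty rows except for the last, with all zeros for class. Single col at max") suggests a sketch like the following, purely an assumption:

    # hypothetical sketch matching the print above: every row is class 0 with no
    # features, except the last row, which sets a single col at the max index
    def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
        colNumberMax = colCount
        synColSumDict = {colNumberMax: 1}   # the only nonzero entry
        dsf = open(csvPathname, 'w')
        for i in range(rowCount - 1):
            dsf.write("0\n")                    # empty row: class only
        dsf.write("0 %d:1\n" % colNumberMax)    # single col at max
        dsf.close()
        return (colNumberMax, synColSumDict)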
Example #22
    def test_NOPASS_exec2_empty_result(self):
        bucket = "smalldata"
        csvPathname = "iris/iris2.csv"
        hexKey = "i.hex"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

        start = time.time()
        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            rSummary = h2o_cmd.runSummary(key="a")
            h2o_cmd.infoFromSummary(rSummary)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", "took", time.time() - start, "seconds"
Example #23
    def test_NOPASS_create_frame_fail(self):
        h2o.beta_features = True

        for trial in range(20):
            kwargs = {
                'integer_range': None,
                'missing_fraction': 0.1,
                'cols': 10,
                'response_factors': 1,
                'seed': 1234,
                'randomize': 1,
                'categorical_fraction': 0,
                'rows': 1,
                'factors': 0,
                'real_range': 0,
                'value': None,
                'integer_fraction': 0
            }

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path='poker/poker1000',
                                           hex_key='temp1000.hex',
                                           schema='put',
                                           timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex',
                                                 timeoutSecs=timeoutSecs,
                                                 **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex',
                                          csvPathname=csvPathname,
                                          timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)

            print "Trial #", trial, "completed"
Example #24
    def test_NOPASS_create_frame_fail(self):
        h2o.beta_features = True

        for trial in range(20):
            kwargs = {
                "integer_range": None,
                "missing_fraction": 0.1,
                "cols": 10,
                "response_factors": 1,
                "seed": 1234,
                "randomize": 1,
                "categorical_fraction": 0,
                "rows": 1,
                "factors": 0,
                "real_range": 0,
                "value": None,
                "integer_fraction": 0,
            }

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(
                bucket="smalldata",
                path="poker/poker1000",
                hex_key="temp1000.hex",
                schema="put",
                timeoutSecs=timeoutSecs,
            )
            cfResult = h2o.nodes[0].create_frame(key="temp1000.hex", timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + "/" + "temp1000.csv"
                h2o.nodes[0].csv_download(src_key="temp1000.hex", csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key="temp1000.hex")

            rSummary = h2o_cmd.runSummary(key="temp1000.hex", cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)

            print "Trial #", trial, "completed"
Example #25
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip",   300), # 110.9MB
            ("train_set.zip",  600), # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path="allstate", schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example #26
    def test_from_import_fvec(self):
        h2o.beta_features = True

        timeoutSecs = 500
        csvFilenameAll = [
            "covtype.data",
            "covtype20x.data",
            ]

        for csvFilename in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local',
                hex_key=hex_key, timeoutSecs=500, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
            h2o_cmd.infoFromSummary(summaryResult)

            trees = 2
            start = time.time()
            rfView = h2o_cmd.runRF(trees=trees, max_depth=20, parseResult=parseResult, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

            l = ('{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. '
                 'trees: {} classification_error: {} classErrorPct: {} totalScores: {}').format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
                trees, classification_error, classErrorPctList, totalScores)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #27
    def test_from_import(self):
        h2o.beta_features = True

        timeoutSecs = 500
        csvFilenameAll = ["covtype.data", "covtype20x.data"]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for csvFilename in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir
            hex_key = csvFilename + ".hex"
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path="standard/" + csvFilename,
                schema="local",
                hex_key=hex_key,
                timeoutSecs=500,
                doSummary=False,
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult["destination_key"])

            summaryResult = h2o_cmd.runSummary(key=parseResult["destination_key"])
            h2o_cmd.infoFromSummary(summaryResult)

            if not h2o.beta_features:
                RFview = h2o_cmd.runRF(trees=1, depth=25, parseResult=parseResult, timeoutSecs=timeoutSecs)

            ## h2b.browseJsonHistoryAsUrlLastMatch("RFView")
            ## time.sleep(10)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Example #28
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename == 'covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0,
                                     naCnt,
                                     msg='col %s naCnt %d should be %s' %
                                     (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int',
                                     stype,
                                     msg='col %s type %s should be %s' %
                                     (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i, c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'int?"
                    assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (
                        i, c)
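
Summary and Inspect can report the same column's type under different names ('Numeric' vs 'Int' above). A small sketch of a tolerant comparison; the mapping is an assumption, not taken from h2o:

# map inspect's finer-grained types onto summary's coarser names (assumed mapping)
INSPECT_TO_SUMMARY = {'Int': 'Numeric', 'Real': 'Numeric', 'Enum': 'Enum'}

def types_agree(inspect_type, summary_type):
    return INSPECT_TO_SUMMARY.get(inspect_type, inspect_type) == summary_type

assert types_agree('Int', 'Numeric')
assert types_agree('Real', 'Numeric')
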
Example #29
    def test_many_fp_formats_libsvm(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30, 'sparse'),
            (100, 100, 'cF', 30, 'sparse50'),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)

                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel,
                                                       rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict,
                 colNumberMax) = write_syn_dataset(csvPathname, rowCount,
                                                   colCount, SEEDPERFILE, sel,
                                                   distribution)

                selKey2 = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=selKey2,
                                               timeoutSecs=timeoutSecs)
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the first one
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0, key=parseResult['destination_key'], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2,
                                                       timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(
                    colNumberMax + 1,
                    numCols,
                    msg=
                    "generated %s cols (including output).  parsed to %s cols"
                    % (colNumberMax + 1, numCols))

                # Exec (column sums)*************************************************
                # zeroList and exprList are module-level globals in the original test file
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                # use numCols?. numCols should be <= colCount.

                colSumList = h2e.exec_expr_list_across_cols(
                    None,
                    exprList,
                    selKey2,
                    maxCol=colNumberMax + 1,
                    timeoutSecs=timeoutSecs)

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="generated %s rows, parsed to %s rows" %
                                 (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k, v in synColSumDict.iteritems():
                    if k > colNumberMax:  # ignore any extra 0 cols at the end
                        continue

                    # k should be integers that match the number of cols
                    self.assertTrue(
                        k >= 0 and k < len(colSumList),
                        msg="k: %s len(colSumList): %s numCols: %s" %
                        (k, len(colSumList), numCols))

                    # classMin/classMax and valMin/valMax below come from module-level
                    # globals in the original test file
                    syn = {}
                    if k == 0:
                        syn['name'] = "C1"
                        syn['type'] = {'Int'}
                        syn['min'] = classMin
                        syn['max'] = classMax
                        # don't check these for the col 0 'Target'
                        # syn['scale'] = {1}
                    elif k == 1:  # we forced this to always be 0
                        syn['name'] = "C2"
                        syn['type'] = {'Int'}
                        syn['min'] = 0
                        syn['max'] = 0
                        # syn['scale'] = {1}
                    else:
                        syn['name'] = "C" + str(k + 1)
                        syn['type'] = {'Int', 'Real'}
                        syn['min'] = valMin
                        syn['max'] = valMax
                        # syn['scale'] = {1,10,100,1000}

                    syn['naCnt'] = 0
                    syn['cardinality'] = -1
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect['cols'][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but
                        # we can check that it's a subset of the allowed range
                        if synKey == 'min':
                            self.assertTrue(
                                syn[synKey] <= cols[synKey],
                                msg='col %s %s %s should be <= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'max':
                            self.assertTrue(
                                syn[synKey] >= cols[synKey],
                                msg='col %s %s %s should be >= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'type':
                            if cols[synKey] not in syn[synKey]:
                                print "cols min/max:", cols['min'], cols['max']
                                print "syn min/max:", syn['min'], syn['max']
                                raise Exception(
                                    'col %s %s %s should be in this allowed %s'
                                    % (k, synKey, cols[synKey], syn[synKey]))
                        else:
                            self.assertEqual(
                                syn[synKey],
                                cols[synKey],
                                msg='col %s %s %s should be %s' %
                                (k, synKey, cols[synKey], syn[synKey]))

                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in the same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(
                        float(v),
                        colSum,
                        places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' %
                        (v, colSum))
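
The places=0 tolerance above exists because floating-point sums depend on accumulation order. A quick standalone illustration (not part of the test):

import random

random.seed(1234)
xs = [random.uniform(-1e6, 1e6) for _ in range(100000)]
forward = sum(xs)
backward = sum(reversed(xs))
# the two totals are mathematically equal but usually differ in the low bits
print forward, backward, abs(forward - backward)
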
Example #30
    def test_summary(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (500000, 1, 'cD', 300, 0, 9), # expectedMin/Max must cause 10 values
            (500000, 2, 'cE', 300, 1, 10), # expectedMin/Max must cause 10 values
            (500000, 2, 'cF', 300, 2, 11), # expectedMin/Max must cause 10 values
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        for (rowCount, colCount, key2, timeoutSecs, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            legalValues = {}
            for v in range(expectedMin, expectedMax + 1):  # inclusive, so we get all 10 values
                legalValues[v] = v
        
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10, doSummary=False)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            summaryResult = h2o_cmd.runSummary(key=key2)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
            # don't print bin_names/bins; they're too big (256 entries?)
            # just touch all the stuff returned
            summary = summaryResult['summary']
            columnsList = summary['columns']
            for columns in columnsList:
                N = columns['N']
                self.assertEqual(N, rowCount)

                name = columns['name']
                stype = columns['type']
                self.assertEqual(stype, 'number')

                histogram = columns['histogram']
                bin_size = histogram['bin_size']
                self.assertEqual(bin_size, 1)

                bin_names = histogram['bin_names']
                bins = histogram['bins']
                nbins = len(bins)

                for b in bins:
                    e = .1 * rowCount
                    self.assertAlmostEqual(b, e, delta=.01 * rowCount,
                        msg="Bins not right. b: %s e: %s" % (b, e))

                # not done if enum
                if stype != "enum":
                    smax = columns['max']
                    smin = columns['min']
                    percentiles = columns['percentiles']
                    thresholds = percentiles['thresholds']
                    values = percentiles['values']
                    mean = columns['mean']
                    sigma = columns['sigma']

                    self.assertEqual(smax[0], expectedMax)
                    self.assertEqual(smax[1], expectedMax-1)
                    self.assertEqual(smax[2], expectedMax-2)
                    self.assertEqual(smax[3], expectedMax-3)
                    self.assertEqual(smax[4], expectedMax-4)
                    
                    self.assertEqual(smin[0], expectedMin)
                    self.assertEqual(smin[1], expectedMin+1)
                    self.assertEqual(smin[2], expectedMin+2)
                    self.assertEqual(smin[3], expectedMin+3)
                    self.assertEqual(smin[4], expectedMin+4)

                    # apparently our 'percentile estimate' uses interpolation, so this check is not met by h2o
                    for v in values:
                        ##    self.assertIn(v,legalValues,"Value in percentile 'values' is not present in the dataset") 
                        # but: you would think it should be within the min-max range?
                        self.assertTrue(v >= expectedMin, 
                            "Percentile value %s should all be >= the min dataset value %s" % (v, expectedMin))
                        self.assertTrue(v <= expectedMax, 
                            "Percentile value %s should all be <= the max dataset value %s" % (v, expectedMax))
                
                    self.assertAlmostEqual(mean, (expectedMax+expectedMin)/2.0, delta=0.1)
                    # discrete uniform over 10 consecutive ints: sigma = sqrt((10**2 - 1)/12.0) ~= 2.87
                    self.assertAlmostEqual(sigma, 2.9, delta=0.1)
                    
                    # since we distribute the outputs evenly from 0 to 9, we can check
                    # that the value is equal to the threshold (within some delta)

                    # is this right?
                    # if thresholds   = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                    # values = [   0,    0,   1,    2,    3,   5,    7,    7,   9,    9,    10]
                    eV1 = [1.0, 1.0, 1.0, 3.0, 4.0, 5.0, 7.0, 8.0, 9.0, 10.0, 10.0]
                    if expectedMin==1:
                        eV = eV1
                    elif expectedMin==0:
                        eV = [e-1 for e in eV1]
                    elif expectedMin==2:
                        eV = [e+1 for e in eV1]
                    else:
                        raise Exception("Test doesn't have the expected values for expectedMin: %s" % expectedMin)

                    for t,v,e in zip(thresholds, values, eV):
                        m = "Percentile threshold: %s with value %s should ~= %s" % (t, v, e)
                        self.assertAlmostEqual(v, e, delta=0.5, msg=m)

            trial += 1

            if (1==0): 
                generate_scipy_comparison(csvPathname)
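
The eV tables above are nearest-rank style percentile expectations for a discrete uniform column; h2o apparently interpolates, which is why the check allows delta=0.5. A standalone sketch of the nearest-rank estimate (not part of the test):

import random

def nearest_rank_percentile(sorted_vals, t):
    # nearest-rank percentile; interpolating estimators can land between levels
    idx = min(len(sorted_vals) - 1, int(t * len(sorted_vals)))
    return sorted_vals[idx]

random.seed(1234)
vals = sorted(random.randint(0, 9) for _ in range(500000))
for t in [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]:
    print t, nearest_rank_percentile(vals, t)
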
Example #31
    def test_parse_bounds_libsvm(self):
        print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
        ## h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 100, 'cA', 300),
            (100000, 100, 'cB', 300),
            (100, 100000, 'cC', 300),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

                parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', 
                    timeoutSecs=timeoutSecs, doSummary=False)
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                row_size = inspect['row_size']
                value_size_bytes = inspect['value_size_bytes']
                print "\n" + csvPathname, \
                    "    num_rows:", "{:,}".format(num_rows), \
                    "    num_cols:", "{:,}".format(num_cols), \
                    "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                    "    row_size:", "{:,}".format(row_size)

                expectedRowSize = num_cols * 1 # plus output
                expectedValueSize = expectedRowSize * num_rows
                self.assertEqual(row_size, expectedRowSize,
                    msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                    (row_size, expectedRowSize))
                self.assertEqual(value_size_bytes, expectedValueSize,
                    msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                    (value_size_bytes, expectedValueSize))


                # summary respects column limits
                col_limit = int(floor(0.3 * colNumberMax))  # floor comes from math in the original file

                # trigger an fvec conversion
                h2o.beta_features = True
                print "Do a summary2, which triggers a VA to fvec"
                summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=col_limit, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
                h2o.beta_features = False
                print "Go back to VA"
                # self.assertEqual(col_limit, len( summaryResult[ 'summary'][ 'columns' ] ), 
                #    "summary doesn't respect column limit of %d on %d cols" % (col_limit, colNumberMax+1))

                summaryResult = h2o_cmd.runSummary(key=hex_key, max_column_display=10*num_cols, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
                self.assertEqual(colNumberMax+1, num_cols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))
                self.assertEqual(rowCount, num_rows, 
                    msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

                summary = summaryResult['summary']
                columnsList = summary['columns']
                self.assertEqual(colNumberMax+1, len(columnsList), 
                    msg="generated %s cols (including output).  summary has %s columns" % (colNumberMax+1, len(columnsList)))

                for columns in columnsList:
                    N = columns['N']
                    # self.assertEqual(N, rowCount)
                    name = columns['name']
                    stype = columns['type']

                    histogram = columns['histogram']
                    bin_size = histogram['bin_size']
                    bin_names = histogram['bin_names']
                    bins = histogram['bins']
                    nbins = len(bins)

                    # definitely not enums
                    zeros = columns['zeros']
                    na = columns['na']
                    smax = columns['max']
                    smin = columns['min']
                    mean = columns['mean']
                    sigma = columns['sigma']

                    # a single 1 in the last col
                    if name == "V" + str(colNumberMax): # h2o puts a "V" prefix
                        synZeros = num_rows - 1
                        synSigma = None # not sure..depends on the # rows somehow (0 count vs 1 count)
                        synMean = 1.0/num_rows
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    elif name == "V1":
                        # can reverse-engineer the # of zeroes, since data is always 1
                        synSum = synColSumDict[1] # could get the same sum for all cols
                        synZeros = num_rows - synSum
                        synSigma = 0.50
                        synMean = (synSum + 0.0)/num_rows
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    else:
                        synZeros = num_rows
                        synSigma = 0.0
                        synMean = 0.0
                        synMin = [0.0]
                        synMax = [0.0]

                    # print zeros, synZeros
                    self.assertAlmostEqual(float(mean), synMean, places=6,
                        msg='col %s mean %s is not equal to generated mean %s' % (name, mean, synMean))

                    # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
                    self.assertTrue(smin >= synMin,
                        msg='col %s min %s is not >= generated min %s' % (name, smin, synMin))

                    self.assertTrue(smax <= synMax,
                        msg='col %s max %s is not <= generated max %s' % (name, smax, synMax))

                    # reverse engineered the number of zeroes, knowing data was always 1 if present?
                    if name == "V65536" or name == "V65537":
                        print "columns around possible zeros mismatch:", h2o.dump_json(columns)

                    self.assertEqual(zeros, synZeros,
                        msg='col %s zeros %s is not equal to generated zeros count %s' % (name, zeros, synZeros))

                    self.assertEqual(stype, 'number',
                        msg='col %s type %s is not equal to %s' % (name, stype, 'number'))

                    # our random generation will have some variance for col 1. so just check to 2 places
                    if synSigma:
                        self.assertAlmostEqual(float(sigma), synSigma, delta=0.03,
                            msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma))

                    self.assertEqual(0, na,
                        msg='col %s num_missing_values %d should be 0' % (name, na))
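
The synSigma = 0.50 expectation for the random 0/1 column above is just the Bernoulli standard deviation at p = 0.5. A sketch of the arithmetic (not part of the test):

from math import sqrt

def bernoulli_stats(p):
    # mean and standard deviation of a {0,1} column where P(1) = p
    return p, sqrt(p * (1 - p))

mean, sigma = bernoulli_stats(0.5)
print mean, sigma   # 0.5 0.5, matching synMean/synSigma for a fair 0/1 column
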
Example #32
    def test_four_billion_rows(self):
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            ("four_billion_rows.csv", "a.hex"),
            ("four_billion_rows.csv", "b.hex"),
            ]
        for (csvFilename, hex_key) in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult['destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(2, numCols,
                msg="generated %s cols (including output).  parsed to %s cols" % (2, numCols))
            self.assertEqual(4*1000000000, numRows,
                msg="generated %s rows, parsed to %s rows" % (4*1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 20,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
                }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            # Exec to make binomial########################
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, 1+1, hex_key, 1+1, 0)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

            # GLM*********************************
            print "\n" + csvFilename
            colX = 0
            kwargs = {
                'response': 'C2', 
                'n_folds': 0,
                'cols': colX, 
                'alpha': 0, 
                'lambda': 0, 
                'family': 'binomial',
                # 'link' can be family_default, identity, logit, log, inverse, tweedie
            }
            # one coefficient is checked a little more

            # L2 
            timeoutSecs = 900
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
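
The exec expression above rewrites a column in place as an equality indicator so GLM can treat the response as binomial. The same idea in plain Python, for intuition (toy data, names assumed):

col = [0, 3, 0, 7, 0]
indicator = [1 if v == 0 else 0 for v in col]   # elementwise (col == 0)
print indicator                                 # [1, 0, 1, 0, 1]
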
Example #33
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]
        # IMPORT**********************************************
        # H2O deletes the source key. So re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        # the list could be from hdfs/s3 (ec2 remap) or local. They have two different list structures
        if 'succeeded' in importFolderResult:
            succeededList = importFolderResult['succeeded']
        elif 'files' in importFolderResult:
            succeededList = importFolderResult['files']
        else:
            raise Exception("Can't find 'files' or 'succeeded' in import list")

        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 3,
                           "Should see more than 3 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            key2 = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseKey['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(
                y=0, key=parseKey['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"

            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                with open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") as f:
                    json.dump(storeViewResult, f, indent=4, sort_keys=True, default=str)
                lastStoreViewResult = storeViewResult

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
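
The loop above leaves one JSON dump of StoreView per node in SYNDATASETS_DIR. A sketch of reading one back (the 'keys' field name is an assumption, not verified against the StoreView schema):

import json

def count_storeview_keys(path):
    # load a dumped StoreView result and count the keys it lists
    with open(path) as f:
        sv = json.load(f)
    return len(sv.get('keys', []))
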
Example #34
    def test_four_billion_rows(self):
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            ("four_billion_rows.csv", "a.hex"),
            ("four_billion_rows.csv", "b.hex"),
            ]
        for (csvFilename, hex_key) in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            value_size_bytes = inspect['value_size_bytes']
            row_size = inspect['row_size']
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1 # plus output
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(2, num_cols,
                msg="generated %s cols (including output).  parsed to %s cols" % (2, num_cols))
            self.assertEqual(4*1000000000, num_rows,
                msg="generated %s rows, parsed to %s rows" % (4*1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'max_iter': 20,
                'cols': None,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
                }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {'x': 0, 'y': 1, 'n_folds': 0, 'case_mode': '=', 'case': 1}
            # one coefficient is checked a little more
            colX = 0

            # L2 
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
Example #35
    def test_c5_KMeans_sphere_26GB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = 'syn_sphere15_gen_26GB.csv'
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156  # byte count matching the 180GB dataset (see commented filename above)
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                ([
                    -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084,
                    31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915,
                    -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    19.00092954923767, -10.999565572612255, 90.00028669073289,
                    1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981,
                    30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736,
                    16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317,
                    -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165,
                    11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991,
                    -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    147.00394564757505, 122.98729664236723, 311.0047920137008,
                    2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]
        else:
            expected = [
                ([
                    0.0, -113.00566692375459, -89.99595447985321,
                    -455.9970643424373, 4732.0, 49791778.0, 36800.0
                ], 248846122, 1308149283316.2988),
                ([
                    0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412,
                    25654042.00592703, 28304.0
                ], 276924291, 1800760152555.98),
                ([
                    0.0, 5.0, 2.0, 340.0, 1817.995920197288,
                    33970406.992053084, 31319.99486705394
                ], 235089554, 375419158808.3253),
                ([
                    0.0, 10.0, -72.00113070337981, -171.0198611715457,
                    4430.00952228909, 37007399.0, 29894.0
                ], 166180630, 525423632323.6474),
                ([
                    0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604,
                    22865824.99639042, 5335.0
                ], 167234179, 1845362026223.1094),
                ([
                    0.0, 12.0, 3.0, 168.0, -4066.995950679284,
                    41077063.00269915, -47537.998050740985
                ], 195420925, 197941282992.43475),
                ([
                    0.0, 19.00092954923767, -10.999565572612255,
                    90.00028669073289, 1928.0, 39967190.0, 27202.0
                ], 214401768, 11868360232.658035),
                ([
                    0.0, 20.0, 0.0, 141.0, -3263.0030236302937,
                    6163210.990273981, 30712.99115201907
                ], 258853406, 598863991074.3276),
                ([
                    0.0, 21.0, 114.01584574295777, 242.99690338815898,
                    1674.0029079209912, 33089556.0, 36415.0
                ], 190979054, 1505088759456.314),
                ([
                    0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022,
                    -48473733.04122273, 47343.0
                ], 87794427, 1124697008162.3955),
                ([
                    0.0, 39.0, 3.0, 470.0, -3337.9880599007597,
                    28768057.98852736, 16716.003410920028
                ], 78226988, 1151439441529.0215),
                ([
                    0.0, 40.0, 1.0, 145.0, 950.9990795199593,
                    14602680.991458317, -14930.007919032574
                ], 167273589, 693036940951.0249),
                ([
                    0.0, 42.0, 4.0, 479.0, -3678.0033024834297,
                    8209673.001421165, 11767.998552236539
                ], 148426180, 35942838893.32379),
                ([
                    0.0, 48.0, 4.0, 71.0, -951.0035145455234,
                    49882273.00063991, -23336.998167498707
                ], 157533313, 88431531357.62982),
                ([
                    0.0, 147.00394564757505, 122.98729664236723,
                    311.0047920137008, 2320.0, 46602185.0, 11212.0
                ], 118361306, 1111537045743.7646),
            ]
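        # per the note above, these row counts fit the 180GB dataset; 26GB
        # expectations would scale by roughly 1/7, e.g. (sketch, names assumed):
        #   expected_26gb = [(ctr, rows // 7, err) for (ctr, rows, err) in expected]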

        # successive assignments; only the last one takes effect, the earlier
        # ones are left as a record of what was tried
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='hdfs',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema='local',
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse',
                csvPathname, fileMBS, elapsed)
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'],
                                         timeoutSecs=300)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'],
                                         numRows=numRows,
                                         numCols=numCols,
                                         timeoutSecs=300)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15,
                'max_iter': 500,
                # 'normalize': 1,
                'normalize': 0,  # temp try
                'initialization': 'Furthest',
                'destination_key': 'junk.hex',
                # we get NaNs if whole col is NA
                'ignored_cols': 'C1',
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial % 3) == 1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             benchmarkLogging=benchmarkLogging,
                                             **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans",
                "trial " + str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            # this does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self,
                                               tupleResultList,
                                               expected,
                                               trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(self,
                                                tupleResultList,
                                                expected,
                                                allowedDelta,
                                                allowError=False,
                                                allowRowError=True,
                                                trial=trial)

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult['model']
            size = model['size']
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            # h2o hits the limit at max_iter-1..shouldn't hit it
            if iterations >= (max_iter - 1):
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s"
                    % (trial, iterations, max_iter))

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2))

            # our expected result is sorted by cluster center ordered. but the sizes are from the predicted histogram
            expectedSize = [t[1] / SCALE_SIZE for t in expected]

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s predicted cluster sizes: %s not the same as expected: %s"
                    % (trial, size2, expectedSize))

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
Example #36
    def test_four_billion_rows_fvec(self):
        h2o.beta_features = True
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
            ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult['destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols), \
                "    byteSize:", "{:,}".format(byteSize)

            expectedRowSize = numCols * 1 # plus output
            # expectedValueSize = expectedRowSize * numRows
            expectedValueSize =  8001271520
            self.assertEqual(byteSize, expectedValueSize,
                msg='byteSize %s is not expected: %s' % \
                (byteSize, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(2, numCols,
                msg="generated %s cols (including output).  parsed to %s cols" % (2, numCols))
            self.assertEqual(4*1000000000, numRows,
                msg="generated %s rows, parsed to %s rows" % (4*1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 4,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
                }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'response': 'C1',
                'n_folds': 0, 
                'family': 'binomial',
            }
            # one coefficient is checked a little more
            colX = 1

            # convert to binomial
            execExpr="A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % ('C1', 'C1', 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            aHack = {'destination_key': "A.hex"}

            # L2 
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
Example #37
    def test_parse_bounds_libsvm (self):
        print "Empty rows except for the last, with all zeros for class. Single col at max"
        ## h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 100, 'cA', 300),
            (100000, 100, 'cB', 300),
            (100, 10000, 'cC', 300),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False)
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']

                self.assertEqual(colNumberMax+1, num_cols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))
                self.assertEqual(rowCount, num_rows, 
                    msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

                # just want to see if we stack trace on these
                for x in range(num_cols):
                    print "Doing summary with x=%s" % x
                    summaryResult = h2o_cmd.runSummary(key=key2, x=x, timeoutSecs=timeoutSecs)
                    # skip the infoFromSummary check

                    if x==0:
                        colName = "Target"
                    else:
                        colName = "V" + str(x)
                    print "Doing summary with col name x=%s" % colName
                    summaryResult = h2o_cmd.runSummary(key=key2, x=colName, timeoutSecs=timeoutSecs)
                    # skip the infoFromSummary check


                # do a final one with all columns for the current check below
                # FIX! we should update the check to check each individual summary result
                print "Doing and checking summary with no x=%s" % x
                summaryResult = h2o_cmd.runSummary(key=key2, max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                summary = summaryResult['summary']
                columnsList = summary['columns']
                self.assertEqual(colNumberMax+1, len(columnsList), 
                    msg="generated %s cols (including output).  summary has %s columns" % (colNumberMax+1, len(columnsList)))

                for columns in columnsList:
                    N = columns['N']
                    # self.assertEqual(N, rowCount)
                    name = columns['name']
                    stype = columns['type']

                    histogram = columns['histogram']
                    bin_size = histogram['bin_size']
                    bin_names = histogram['bin_names']
                    bins = histogram['bins']
                    nbins = len(bins) # number of histogram bins

                    # definitely not enums
                    zeros = columns['zeros']
                    na = columns['na']
                    smax = columns['max']
                    smin = columns['min']
                    mean = columns['mean']
                    sigma = columns['sigma']

                    # a single 1 in the last col
                    # print name
                    if name == ("V" + str(colNumberMax)): # h2o puts a "V" prefix
                        synMean = 1.0/num_rows # the single 1, averaged over all rows
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    else:
                        synMean = 0.0
                        synMin = [0.0]
                        synMax = [0.0]

                    self.assertEqual(float(mean), synMean,
                        msg='col %s mean %s is not equal to generated mean %s' % (name, mean, synMean))

                    # min/max come back as lists because summary returns the N smallest / N largest values
                    self.assertEqual(smin, synMin,
                        msg='col %s min %s is not equal to generated min %s' % (name, smin, synMin))

                    self.assertEqual(smax, synMax,
                        msg='col %s max %s is not equal to generated max %s' % (name, smax, synMax))

                    self.assertEqual(0, na,
                        msg='col %s num_missing_values %d should be 0' % (name, na))
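
Per the banner this test prints ("Empty rows except for the last, with all zeros for class. Single col at max"), the generated libsvm file is all label-0 rows with no features, plus a single 1 in the max column of the last row; that shape is exactly what the synMean/synMin/synMax expectations above encode. A sketch of a generator with that shape (hypothetical; the real write_syn_dataset is defined elsewhere in the module):

    # hypothetical sketch of the dataset shape the assertions above assume
    def write_sketch(csvPathname, rowCount, colCount):
        with open(csvPathname, 'w') as f:
            for r in range(rowCount):
                if r == rowCount - 1:
                    # label 0, single value 1 in the last (max) column; libsvm cols are 1-based
                    f.write("0 %d:1\n" % colCount)
                else:
                    f.write("0\n")  # label only; all features implicitly zero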

    def test_many_fp_formats_libsvm(self):
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30,'sparse'),
            (100, 100, 'cF', 30,'sparse50'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
            # for sel in range(48): # len(caseList)
            for sel in [random.randint(0,47)]: # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict, colNumberMax)  = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

                selKey2 = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
                print csvFilename, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values, 
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the first one
                goodX = h2o_glm.goodXFromColumnInfo(y=0,
                    key=parseResult['destination_key'], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)


                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(colNumberMax+1, num_cols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))

                # Exec (column sums)*************************************************
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                # use num_cols? num_cols should be <= colCount.

                colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                    timeoutSecs=timeoutSecs)

                self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k,v in synColSumDict.iteritems():
                    if k > colNumberMax: # ignore any extra 0 cols at the end
                        continue

                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colSumList), msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                    syn = {}
                    if k==0: 
                        syn['name'] = "Target"
                        syn['size'] = {1,2} # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                        syn['type'] = {'int'}
                        syn['min'] = classMin
                        syn['max'] = classMax
                        # don't check these for the col 0 'Target'
                        syn['scale'] = {1}
                        # syn['base'] = 0
                        # syn['variance'] = 0
                    elif k==1: # we forced this to always be 0
                        syn['name'] = "V" + str(k)
                        syn['size'] = {1}
                        syn['type'] = {'int'}
                        syn['min'] = 0
                        syn['max'] = 0
                        syn['scale'] = {1}
                        syn['base'] = 0
                        syn['variance'] = 0
                    else:
                        syn['name'] = "V" + str(k)
                        syn['size'] = {1,2,4,8} # element size can be 1, 2, 4 or 8 bytes; a set for membership check
                        syn['type'] = {'int', 'float'}
                        syn['min'] = valMin
                        syn['max'] = valMax
                        syn['scale'] = {1,10,100,1000}
                        # syn['base'] = 0
                        # syn['variance'] = 0

                    syn['num_missing_values'] = 0
                    syn['enum_domain_size'] = 0
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect['cols'][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but 
                        # we can check that it's a subset of the allowed range
                        if synKey == 'min':
                            self.assertTrue(syn[synKey] <= cols[synKey],
                                msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'max':
                            self.assertTrue(syn[synKey] >= cols[synKey],
                                msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                            if cols[synKey] not in syn[synKey]:
                                # for debug of why it was a bad size
                                print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                                print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                                raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                        else:
                            self.assertEqual(syn[synKey], cols[synKey],
                                msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))
                    
                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(float(v), colSum, places=0, 
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
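
Note that places=0 in assertAlmostEqual only requires round(v - colSum, 0) == 0, so the two sums may differ by up to 0.5 and still pass; a quick illustration:

    # places=0: the difference, rounded to zero decimal places, must equal 0,
    # so sums within 0.5 of each other compare as "almost equal"
    print round(100.0 - 100.4, 0) == 0   # True  -> assertAlmostEqual passes
    print round(100.0 - 100.6, 0) == 0   # False -> assertAlmostEqual fails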
Example #39
    def test_c5_KMeans_sphere_67MB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
        totalBytes = 67306997
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]
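        # assumed layout of each expected tuple, inferred from the
        # h2o_kmeans.compareResultsToExpected() call below:
        # (cluster center coordinates, expected rows in cluster, expected within-cluster error)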

        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []
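        # note: only the last of the successive assignments above takes effect,
        # so benchmark logging ends up disabled for this run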

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # clear out all NAs (walk across cols)..clear to 0
            # temp
            ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
            ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'max_iter': 10,
                'normalize': 1,
                'initialization': 'Furthest',
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
                }

            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList)  = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
            h2i.delete_keys_at_all_nodes()
    def test_create_rebalance_2enum(self):
        # default
        params = {'rows': 100, 'cols': 1}
        for trial in range(20):
            # CREATE FRAME params################################################################
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
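                    # worked example: i=0.7, c=0.6 gets clamped to c=0.3, so the
                    # integer + categorical fractions sum to 1.0 (real fraction becomes 0)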
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0

            # CREATE FRAME*****************************************************
            kwargs = params.copy()
            print kwargs
            timeoutSecs = 300
            hex_key = 'temp1000.hex'
            cfResult = h2o.nodes[0].create_frame(key=hex_key,
                                                 timeoutSecs=timeoutSecs,
                                                 **kwargs)

            # REBALANCE*****************************************************
            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % (hex_key)
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key,
                                                           REBALANCE_CHUNKS)
            SEEDPERFILE = random.randint(0, sys.maxint)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key,
                                                     after=rb_key,
                                                     chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", hex_key, 'to', rb_key, 'took', elapsed, 'seconds',\

            # TO ENUM*****************************************************
            print "Now doing to_enum across all columns of %s" % rb_key
            for column_index in range(params['cols']):
                # column_index is 1-based in to_enum
                result = h2o.nodes[0].to_enum(None,
                                              src_key=rb_key,
                                              column_index=column_index + 1)
                # print "\nto_enum result:", h2o.dump_json(result)
                # summarize the rebalanced key we just converted (hex_key was left untouched)
                summaryResult = h2o_cmd.runSummary(key=rb_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']

                # we have some # of na's in the columns...but there should not be 100% NA
                if nacnt >= params['rows']:
                    raise Exception(
                        "column %s (name '%s') is 100%% NA after conversion to Enum: nacnt %s, rows %s"
                        % (column_index, colname, nacnt, params['rows']))

                print "I suspect that columns that are constant, maybe with NAs also, don't convert to Enum"
                if stattype != 'Enum':
                    raise Exception(
                        "column %s, which has name '%s', didn't convert to Enum, is %s %s %s"
                        % (column_index, colname, stattype, coltype,
                           h2o.dump_json(column)))

                cardinality = stats['cardinality']
                # don't know the cardinality expected
                # if cardinality!=4:
                #     raise Exception("column %s, which has name '%s',  should have cardinality 4, got: %s" %
                #         (column_index, colname, cardinality))

                h2o_cmd.infoFromSummary(summaryResult)

            print "Trial #", trial, "completed"
Example #41
    def test_four_billion_rows_fvec(self):
        h2o.beta_features = True
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
        ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=180,
                                           retryDelaySecs=3)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols), \
                "    byteSize:", "{:,}".format(byteSize)

            expectedRowSize = numCols * 1  # plus output
            # expectedValueSize = expectedRowSize * numRows
            expectedValueSize = 8001271520
            self.assertEqual(byteSize, expectedValueSize,
                msg='byteSize %s is not expected: %s' % \
                (byteSize, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                numCols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, numCols))
            self.assertEqual(4 * 1000000000,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 10,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       retryDelaySecs=4,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'response': 'C1',
                'n_folds': 0,
                'family': 'binomial',
            }
            # one coefficient is checked a little more
            colX = 1

            # convert to binomial
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('1', '1', 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            aHack = {'destination_key': "A.hex"}

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
Example #42
    def test_exec_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 3, 2, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    # cel = colEnumList[c]
                    cel = colEnumList
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' && '.join(cutExprList)
                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

                print "j:", j

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # print h2o.dump_json(inspect)

            rSummary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            h2o_cmd.infoFromSummary(rSummary)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==0:
                    start = time.time()
                    e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1))

                    elapsed = time.time() - start
                    print "exec 1 took", elapsed, "seconds."
                    execTime = elapsed

                if 1==1:
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                if 1==0:
                    gKey = random.choice(eKeys)
                    # do a 2nd random to see if things blow up
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey))
                    elapsed = time.time() - start
                    print "exec 3 took", elapsed, "seconds."

                if 1==1:
                    inspect = h2o_cmd.runInspect(key=fKey)
                    h2o_cmd.infoFromInspect(inspect, fKey)
                    numRows = inspect['numRows']
                    numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
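                # MAX_QBINS and MULTI_PASS are module-level knobs defined elsewhere;
                # presumably max_qbins sets the histogram bin budget and multiple_pass
                # selects iterative (exact) vs single-pass (approximate) quantiles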
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #43
    def test_impute_with_na(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       schema='local',
                                       timeoutSecs=20)

        print "Just insert some NAs and see what happens"
        inspect = h2o_cmd.runInspect(key=hex_key)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        missing_fraction = 0.1

        # NOT ALLOWED TO SET AN ENUM COL?
        if 1 == 0:
            # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec?
            # just one in row 1
            for enumCol in enumColList:
                print "hack: Putting NA in row 0 of col %s" % enumCol
                execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after exec:", missingValuesList
            if len(missingValuesList) != len(enumColList):
                raise Exception(
                    "Didn't get missing values in expected number of cols: %s %s"
                    % (enumColList, missingValuesList))

        for trial in range(5):
            # copy the dataset
            hex_key2 = 'c.hex'
            execExpr = '%s = %s' % (hex_key2, hex_key)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            imvResult = h2o.nodes[0].insert_missing_values(
                key=hex_key2, missing_fraction=missing_fraction, seed=SEED)
            print "imvResult", h2o.dump_json(imvResult)
            # maybe make the output col a factor column
            # maybe one of the 0,1 cols too?
            # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns.
            # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3)

            print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before"
            expectedMissing = missing_fraction * origNumRows  # per col
            enumColList = [49, 50, 51, 52, 53, 54]
            for e in enumColList:
                enumResult = h2o.nodes[0].to_enum(src_key=hex_key2,
                                                  column_index=(e + 1))

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(origNumRows, numRows)
            self.assertEqual(origNumCols, numCols)

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList", missingValuesList
            if len(missingValuesList) != numCols:
                raise Exception(
                    "Why is missingValuesList not right afer ToEnum2?: %s %s" %
                    (enumColList, missingValuesList))

            for mv in missingValuesList:
                self.assertAlmostEqual(mv,
                                       expectedMissing,
                                       delta=0.1 * mv,
                                       msg='mv %s is not approx. expected %s' %
                                       (mv, expectedMissing))

            summaryResult = h2o_cmd.runSummary(key=hex_key2)
            h2o_cmd.infoFromSummary(summaryResult)
            # h2o_cmd.infoFromSummary(summaryResult)

            print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
            print "trial", trial
            print "expectedMissing:", expectedMissing

            print "Now get rid of all the missing values, but imputing means. We know all columns should have NAs from above"
            print "Do the columns in random order"

            # don't do the enum cols ..impute doesn't support right?
            if AVOID_BUG:
                shuffledColList = range(0, 49)  # 0 to 48
                execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
                # summaryResult = h2o_cmd.runSummary(key=hex_key2)
                # h2o_cmd.infoFromSummary(summaryResult)
                inspect = h2o_cmd.runInspect(key=hex_key2)
                numCols = inspect['numCols']
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                print "missingValuesList after impute:", missingValuesList
                if len(missingValuesList) != 49:
                    raise Exception(
                        "expected missing values in all cols after pruning enum cols: %s"
                        % missingValuesList)
            else:
                shuffledColList = range(0, 55)  # 0 to 54

            origInspect = inspect
            random.shuffle(shuffledColList)

            for column in shuffledColList:
                # get a random set of columns: no duplicates, random order; size 0 is okay (gives [])
                groupBy = random.sample(range(55), random.randint(0, 54))
                # header names start with 1, not 0. Empty string if []
                groupByNames = ",".join(
                    map(lambda x: "C" + str(x + 1), groupBy))

                # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap
                columnName = "C%s" % (column + 1)
                print "don't use mode if col isn't enum"
                badChoices = True
                while badChoices:
                    method = random.choice(["mean", "median", "mode"])
                    badChoices = column not in enumColList and method == "mode"

                NEWSEED = random.randint(0, sys.maxint)
                print "does impute modify the source key?"
                # we get h2o error (argument exception) if no NAs
                impResult = h2o.nodes[0].impute(source=hex_key2,
                                                column=column,
                                                method=method)

            print "Now check that there are no missing values"
            print "FIX! broken..insert missing values doesn't insert NAs in enum cols"

            inspect = h2o_cmd.runInspect(key=hex_key2)
            numRows2 = inspect['numRows']
            numCols2 = inspect['numCols']
            self.assertEqual(
                numRows, numRows2,
                "imput shouldn't have changed frame numRows: %s %s" %
                (numRows, numRows2))

            self.assertEqual(
                numCols, numCols2,
                "imput shouldn't have changed frame numCols: %s %s" %
                (numCols, numCols2))

            # check that the mean didn't change for the col
            # the enum cols with mode, we'll have to think of something else

            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after impute:", missingValuesList
            if missingValuesList:
                raise Exception(
                    "Not expecting any missing values after imputing all cols: %s"
                    % missingValuesList)

            cols = inspect['cols']
            origCols = origInspect['cols']
            for i, (c, oc) in enumerate(zip(cols, origCols)):
                # I suppose since we impute to either median or mean, we can't assume the mean stays the same,
                # but within this tolerance it's okay (with a different dataset, that might not be true)
                h2o_util.approxEqual(
                    c['mean'],
                    oc['mean'],
                    tol=0.000000001,
                    msg=
                    "col %i original mean: %s not equal to mean after impute: %s"
                    % (i, oc['mean'], c['mean']))
Example #44
    def test_exec2_na2mean(self):
        h2o.beta_features = True
        print "https://0xdata.atlassian.net/browse/PUB-228"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            'rcnt = c(0)',
            'total = c(0)',
            'mean = c(0)',
            's.hex = r.hex',
            "x=r.hex[,1]; rcnt=nrow(x)-sum(is.na(x))",
            "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x))",
            "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt",
            "x=r.hex[,1]; total=sum(ifelse(is.na(x),0,x)); rcnt=nrow(x)-sum(is.na(x)); mean=total / rcnt; x=ifelse(is.na(x),mean,x)",
        ]

        execExprList2 = [
            "s.hex = apply(r.hex,2," +
                "function(x){total=sum(ifelse(is.na(x),0,x)); " + \
                "rcnt=nrow(x)-sum(is.na(x)); " + \
                "mean=total / rcnt; " + \
                "ifelse(is.na(x),mean,x)} " + \
            ")" ,
            # this got an exception. note I forgot to assign to x here
            "s=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)",
            # throw in a na flush to 0
            "x=r.hex[,1]; s.hex[,1]=ifelse(is.na(x),0,x)",
        ]
        execExprList += execExprList2

        results = []
        for execExpr in execExprList:
            start = time.time()
            (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # unneeded but interesting 
            results.append(result)
            print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(resultExec)
            h2o.check_sandbox_for_errors()

        # compare it to summary
        rSummary = h2o_cmd.runSummary(key='r.hex', cols='0')
        h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s.hex', cols='0')
        h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        print "Comparing summary of r.hex to summary of s.hex"
        df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
        # time can be different
        print "df.difference:", h2o.dump_json(df.difference)
        self.assertLess(len(df.difference), 2)
    

        print "results from the individual exec expresssions (ignore last which was an apply)"
        print "results:", results
        self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0, 1859.0, 1859.0])
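
The exec expressions above build the NA-to-mean fill up one step at a time; a pure-Python equivalent for a single column, for illustration only (NA modeled as None):

    # pure-Python illustration of the na2mean logic the exec expressions compute
    def na2mean(xs):
        vals = [x for x in xs if x is not None]        # rcnt/total use only non-NA rows
        mean = sum(vals) / float(len(vals))            # mean = total / rcnt
        return [mean if x is None else x for x in xs]  # ifelse(is.na(x), mean, x)

    print na2mean([1.0, None, 3.0])  # -> [1.0, 2.0, 3.0]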
Example #45
    def test_c10_glm_fvec(self):
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        csvFilename = 'classification1Train.txt'
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()

        # hack. force it to NA the header, so we have col names that are not customer-sensitive below
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='local',
                                       timeoutSecs=500,
                                       doSummary=False,
                                       header=0)
        print "Parse of", parseResult['destination_key'], "took", time.time(
        ) - start, "seconds"

        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None,
                                     parseResult['destination_key'],
                                     timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time(
        ) - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # keepList = []
        # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
        # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices

        y = 0
        ignore_x = []
        x = [
            6, 7, 8, 10, 12, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 43, 44,
            45, 46, 47, 49, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
            66, 67, 68, 69, 70
        ]
        for i in range(numCols):
            if i not in x and i != y:
                ignore_x.append(i)

        # since we're no longer zero-based, increment by 1
        ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

        # GLM Train***********************************************************
        keepPattern = None
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y,
                                        keepPattern=keepPattern,
                                        key=parseResult['destination_key'],
                                        timeoutSecs=300)
        print "x:", x
        print "ignore_x:", x

        kwargs = {
            'response': y,
            'ignored_cols': ignore_x,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 10,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
        }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=timeoutSecs,
                             pollTimeoutSecs=60,
                             **kwargs)
        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Parse Test***********************************************************
        GLMModel = glm['glm_model']
        modelKey = GLMModel['_key']

        csvFilename = 'classification1Test.txt'
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='local',
                                       timeoutSecs=500,
                                       doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time(
        ) - start, "seconds"
Example #46
    def test_parse_time(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        colCount = 6
        rowCount = 10
        headerData = rand_header(colCount)
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(1):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsA = inspect['numRows']
            numColsA = inspect['numCols']

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               timeoutSecs=100,
                                               numCols=numColsA,
                                               numRows=numRowsA,
                                               noPrint=True)

            print summaryResult
            h2o_cmd.infoFromSummary(summaryResult)
            (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)

            if constantValuesDictA or enumSizeDictA:
                raise Exception(
                    "Should be empty?  constantValuesDictA %s enumSizeDictA %s"
                    % (constantValuesDictA, enumSizeDictA))

            print "missingValuesListA", missingValuesListA

            # self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
            self.assertEqual(numColsA, colCount)
            self.assertEqual(numRowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsB = inspect['numRows']
            numColsB = inspect['numCols']
            print "missingValuesListB", missingValuesListB
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               timeoutSecs=100,
                                               numCols=numColsB,
                                               numRows=numRowsB,
                                               noPrint=True)
            (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
            print "missingValuesDictB", missingValuesDictB
            if constantValuesDictB or enumSizeDictB:
                raise Exception(
                    "Should be empty?  constantValuesDictB %s enumSizeDictB %s"
                    % (constantValuesDictB, enumSizeDictB))

            self.assertEqual(
                missingValuesDictA, missingValuesDictB,
                "missingValuesDict mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # but in this dataset we have a header too, so the row counts should be equal
            # if not, maybe the parse of our dataset didn't detect a row
            self.assertEqual(
                numRowsA, numRowsB,
                "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result"
                % (numRowsA, numRowsB))

            # FIX! should do some comparison of values?
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()

    def test_many_fp_formats_libsvm_2(self):
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 300, 'sparse50'),
            (100, 10000, 'cB', 300, 'sparse'),
            (100, 40000, 'cC', 300, 'sparse50'),
            (100, 40000, 'cD', 300, 'sparse'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList:
            # for sel in range(48): # len(caseList)
            for sel in [random.randint(0,47)]: # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

                selKey2 = key2 + "_" + str(sel)
                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs, doSummary=False)
                print csvFilename, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values, 
                # to see if we have x specified well
                # figures out everything from parseKey['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the first one
                goodX = h2o_glm.goodXFromColumnInfo(y=0,
                    key=parseKey['destination_key'], timeoutSecs=300, noPrint=True)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=timeoutSecs)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))

                # Exec (column sums)*************************************************
                if DO_COMPARE_SUM:
                    h2e.exec_zero_list(zeroList)
                    colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                        timeoutSecs=timeoutSecs)
                    print "\n*************"
                    print "colResultList", colResultList
                    print "*************"

                self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                ### print "\nsynColSumDict:", synColSumDict

                for k,v in synColSumDict.iteritems():
                    if DO_COMPARE_SUM:
                        # k should be integers that match the number of cols
                        self.assertTrue(k>=0 and k<len(colResultList))
                        compare = colResultList[k]
                        print "\nComparing col sums:", v, compare
                        # Even though we're comparing floating point sums, the operations probably should have
                        # been done in the same order, so maybe the comparison can be exact (or not!
                        # see the float-ordering sketch after this test)
                        self.assertAlmostEqual(v, compare, places=0, 
                            msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                    synMean = (v + 0.0)/rowCount
                    # enums don't have mean, but we're not enums
                    mean = inspect['cols'][k]['mean']
                    # our fp formats in the syn generation sometimes only have two places?
                    self.assertAlmostEqual(mean, synMean, places=0,
                        msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                    num_missing_values = inspect['cols'][k]['num_missing_values']
                    self.assertEqual(0, num_missing_values,
                        msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
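# A small illustration (plain Python, independent of h2o) of why the sum comparison
# above uses assertAlmostEqual rather than exact equality: summing the same floats
# in a different order can change the result.
import math

a = [1e16, -1e16, 1.0]
b = [1e16, 1.0, -1e16]            # same values, different order
print sum(a), sum(b)              # 1.0 vs 0.0 with naive left-to-right summation
print math.fsum(a), math.fsum(b)  # fsum is exactly 1.0 for both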
    def test_c5_KMeans_sphere_26GB_fvec(self):
        # a kludge
        h2o.setup_benchmark_log()

        # csvFilename = 'syn_sphere15_2711545732row_6col_180GB_from_7x.csv'
        csvFilename = "syn_sphere15_gen_26GB.csv"
        # csvFilename = 'syn_sphere_gen_h1m.csv'
        # csvFilename = 'syn_sphere_gen_real_1.49M.csv'
        # csvFilename = 'syn_sphere_gen_h1m_no_na.csv'

        totalBytes = 183538602156
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + "/" + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
        if NA_COL_BUG:
            expected = [
                # the centers are the same for the 26GB and 180GB. The # of rows is right for 180GB,
                # so shouldn't be used for 26GB
                # or it should be divided by 7
                # the distribution is the same, obviously.
                (
                    [-113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]
        else:
            expected = [
                (
                    [0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0],
                    248846122,
                    1308149283316.2988,
                ),
                (
                    [0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0],
                    276924291,
                    1800760152555.98,
                ),
                (
                    [0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394],
                    235089554,
                    375419158808.3253,
                ),
                (
                    [0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0],
                    166180630,
                    525423632323.6474,
                ),
                (
                    [0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0],
                    167234179,
                    1845362026223.1094,
                ),
                (
                    [0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985],
                    195420925,
                    197941282992.43475,
                ),
                (
                    [0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0],
                    214401768,
                    11868360232.658035,
                ),
                (
                    [0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907],
                    258853406,
                    598863991074.3276,
                ),
                (
                    [0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0],
                    190979054,
                    1505088759456.314,
                ),
                (
                    [0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0],
                    87794427,
                    1124697008162.3955,
                ),
                (
                    [0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028],
                    78226988,
                    1151439441529.0215,
                ),
                (
                    [0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574],
                    167273589,
                    693036940951.0249,
                ),
                (
                    [0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539],
                    148426180,
                    35942838893.32379,
                ),
                (
                    [0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707],
                    157533313,
                    88431531357.62982,
                ),
                (
                    [0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0],
                    118361306,
                    1111537045743.7646,
                ),
            ]

        benchmarkLogging = ["cpu", "disk", "network", "iostats", "jstack"]
        benchmarkLogging = ["cpu", "disk", "network", "iostats"]
        # IOStatus can hang?
        benchmarkLogging = ["cpu", "disk", "network"]
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="hdfs",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )
            else:
                parseResult = h2i.import_parse(
                    path=csvPathname,
                    schema="local",
                    hex_key=hex_key,
                    timeoutSecs=timeoutSecs,
                    pollTimeoutSecs=60,
                    retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging,
                    doSummary=False,
                    **kwargs
                )

            elapsed = time.time() - start
            fileMBS = (totalBytes / 1e6) / elapsed
            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "Parse", csvPathname, fileMBS, elapsed
            )
            print "\n" + l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"], timeoutSecs=300)
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summary = h2o_cmd.runSummary(
                key=parseResult["destination_key"], numRows=numRows, numCols=numCols, timeoutSecs=300
            )
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                "k": 15,
                "max_iter": 500,
                # 'normalize': 1,
                "normalize": 0,  # temp try
                "initialization": "Furthest",
                "destination_key": "junk.hex",
                # we get NaNs if the whole col is NA
                "ignored_cols": "C1",
                # reuse the same seed, to get deterministic results
                "seed": 265211114317615310,
            }

            if (trial % 3) == 0:
                kwargs["initialization"] = "PlusPlus"
            elif (trial % 3) == 1:
                kwargs["initialization"] = "Furthest"
            else:
                kwargs["initialization"] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeansResult = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging, **kwargs
            )
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100
            )
            print "kmeans result:", h2o.dump_json(kmeansResult)

            l = "{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}".format(
                len(h2o.nodes),
                h2o.nodes[0].java_heap_GB,
                "KMeans",
                "trial " + str(trial),
                csvFilename,
                elapsed,
                paramsString,
            )
            print l
            h2o.cloudPerfH2O.message(l)

            # this does predict
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeansResult, csvPathname, parseResult, "d", **kwargs
            )
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01)
            # these clusters were sorted compared to the cluster order in training
            h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial)
            # why is the expected # of rows not right in KMeans2. That means predictions are wrong
            h2o_kmeans.compareResultsToExpected(
                self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial
            )

            # the tupleResultList has the size during predict? compare it to the sizes during training
            # I assume they're in the same order.
            model = kmeansResult["model"]
            size = model["size"]
            size2 = [t[1] for t in tupleResultList]

            if 1 == 1:  # debug
                print "training size:", size
                print "predict size2:", size2
                print "training sorted(size):", sorted(size)
                print "predict sorted(size2):", sorted(size2)
                print h2o.nodes[0].http_addr
                print h2o.nodes[0].port

            clusters = model["centers"]
            cluster_variances = model["within_cluster_variances"]
            error = model["total_within_SS"]
            iterations = model["iterations"]
            normalized = model["normalized"]
            max_iter = model["max_iter"]
            print "iterations", iterations

            if iterations >= (max_iter - 1):  # h2o hits the limit at max_iter-1..shouldn't hit it
                raise Exception(
                    "trial: %s KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s"
                    % (trial, iterations, max_iter)
                )

            # this size stuff should be compared now in compareResultsToExpected()..leave it here to make sure

            # can't do this compare, because size2 is sorted by center order..
            # so we don't know how to reorder size the same way
            # we could just sort the two of them, for some bit of comparison.
            if sorted(size) != sorted(size2):
                raise Exception(
                    "trial: %s training cluster sizes: %s not the same as predict on same data: %s"
                    % (trial, size, size2)
                )

            # our expected result is sorted by cluster center order, but the sizes are from the
            # predicted histogram (see the size-comparison sketch after this test)
            expectedSize = [t[1] / SCALE_SIZE for t in expected]

            if size2 != expectedSize:
                raise Exception(
                    "trial: %s predicted cluster sizes: %s not the same as expected: %s" % (trial, size2, expectedSize)
                )

            if DELETE_KEYS_EACH_ITER:
                h2i.delete_keys_at_all_nodes()
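# A minimal sketch (plain Python, hypothetical numbers) of the size comparison done
# above: predicted sizes come back sorted by cluster center, so compare them against
# the expected sizes scaled down by SCALE_SIZE (here assumed 7, since the expected
# counts were recorded for the 7x-larger dataset).
SCALE = 7
expectedExample = [(None, 700, None), (None, 1400, None)]  # (center, size, error) tuples
predictedSizes = [100, 200]
expectedSizes = [t[1] / SCALE for t in expectedExample]
assert sorted(predictedSizes) == sorted(expectedSizes), \
    "sizes %s != expected %s" % (predictedSizes, expectedSizes)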
Example #49
0
    def test_parse_bounds_csv(self):
        print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 100000, 'cB', 300),
            (1000, 1000, 'cA', 300),
            (1000, 999, 'cC', 300),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            synSumList = write_syn_dataset(csvPathname, rowCount, colCount,
                                           SEEDPERFILE)

            # PARSE**********************
            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           schema='put',
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT*******************
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         max_column_display=colCount,
                                         timeoutSecs=timeoutSecs)
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            row_size = inspect['row_size']
            value_size_bytes = inspect['value_size_bytes']
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1  # plus output
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))

            iCols = inspect['cols']
            iColNameToOffset = {}
            for iColDict in iCols:
                # even though 'offset' exists, we'll use 'name' as the common key
                # to compare inspect and summary results
                iName = iColDict['name']
                iOffset = iColDict['offset']
                iColNameToOffset[iName] = iOffset
                # just touching to make sure they are there
                num_missing_values = iColDict['num_missing_values']
                iMin = float(iColDict['min'])
                iMax = float(iColDict['max'])
                iMean = float(iColDict['mean'])
                iVariance = float(iColDict['variance'])

            # SUMMARY********************************
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_column_display=colCount,
                                               timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(rowCount,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (rowCount, num_rows))

            summary = summaryResult['summary']
            columnsList = summary['columns']
            self.assertEqual(
                colCount,
                len(columnsList),
                msg=
                "generated %s cols (including output).  summary has %s columns"
                % (colCount, len(columnsList)))

            for columns in columnsList:
                name = columns['name']
                iOffset = iColNameToOffset[name]
                iColDict = iCols[iOffset]

                iMin = iColDict['min']
                iMax = iColDict['max']
                iMean = iColDict['mean']
                iVariance = iColDict['variance']
                iNumMissingValues = iColDict['num_missing_values']

                # from the summary
                N = columns['N']
                stype = columns['type']

                histogram = columns['histogram']
                bin_size = histogram['bin_size']
                bin_names = histogram['bin_names']
                bins = histogram['bins']
                nbins = histogram['nbins']

                smax = columns['max']
                smin = columns['min']
                smean = columns['mean']
                sigma = columns['sigma']
                na = columns['na']
                # no zeroes if enum, but we're not enum here
                zeros = columns['zeros']

                self.assertEqual(
                    iMin, smin[0],
                    "inspect min %s != summary min %s" % (iMin, smin))
                self.assertEqual(
                    iMax, smax[0],
                    "inspect max %s != summary max %s" % (iMax, smax))
                self.assertEqual(
                    iMean, smean,
                    "inspect mean %s != summary mean %s" % (iMean, smean))
                self.assertEqual(
                    iVariance, sigma,
                    "inspect variance %s != summary sigma %s" %
                    (iVariance, sigma))
                self.assertEqual(
                    iNumMissingValues, na,
                    "inspect num_missing_values %s != summary na %s" %
                    (iNumMissingValues, na))
                # no comparison for 'zeros'

                # now, also compare expected values
                if name == "V1":
                    synNa = 0
                    # can reverse-engineer the # of zeroes, since data is always 1
                    synSum = synSumList[1]  # could get the same sum for all cols
                    synZeros = num_rows - synSum
                    synSigma = 0.50
                    synMean = (synSum + 0.0) / num_rows
                    synMin = [0.0, 1.0]
                    synMax = [1.0, 0.0]

                elif name == "V2":
                    synSum = 0
                    synSigma = 0
                    synMean = 0
                    if DO_NAN:
                        synZeros = 0
                        synNa = num_rows
                        synMin = []
                        synMax = []
                    else:
                        synZeros = num_rows
                        synNa = 0
                        synMin = [0.0]
                        synMax = [0.0]

                # a single 1 in the last col
                elif name == "V" + str(colCount - 1):  # h2o puts a "V" prefix
                    synNa = 0
                    synSum = synSumList[colCount - 1]
                    synZeros = num_rows - 1
                    # population stddev (stdev.p):
                    # http://office.microsoft.com/en-us/excel-help/stdev-p-function-HP010335772.aspx
                    # (see the sigma sketch after this test)

                    synMean = 1.0 / num_rows  # why does this need to be a 1 entry list
                    synSigma = math.sqrt(pow((synMean - 1), 2) / num_rows)
                    print "last col with single 1. synSigma:", synSigma
                    synMin = [0.0, 1.0]
                    synMax = [1.0, 0.0]

                else:
                    synNa = 0
                    synSum = 0
                    synZeros = num_rows
                    synSigma = 0.0
                    synMean = 0.0
                    synMin = [0.0]
                    synMax = [0.0]

                if DO_MEAN:
                    self.assertAlmostEqual(
                        float(smean),
                        synMean,
                        places=6,
                        msg='col %s mean %s is not equal to generated mean %s'
                        % (name, smean, synMean))

                # why are min/max one-entry lists in summary result. Oh..it puts N min, N max
                self.assertTrue(
                    smin >= synMin,
                    msg='col %s min %s is not >= generated min %s' %
                    (name, smin, synMin))

                self.assertTrue(
                    smax <= synMax,
                    msg='col %s max %s is not <= generated max %s' %
                    (name, smax, synMax))

                # reverse engineered the number of zeroes, knowing data was always 1 if present?
                if name == "V65536" or name == "V65537":
                    print "columns around possible zeros mismatch:", h2o.dump_json(
                        columns)

                self.assertEqual(
                    na,
                    synNa,
                    msg='col %s na %s is not equal to generated na %s' %
                    (name, na, synNa))

                self.assertEqual(
                    zeros,
                    synZeros,
                    msg='col %s zeros %s is not equal to generated zeros %s' %
                    (name, zeros, synZeros))

                self.assertEqual(stype,
                                 'number',
                                 msg='col %s type %s is not equal to %s' %
                                 (name, stype, 'number'))

                # our random generation will have some variance for col 1. so just check to 2 places
                if synSigma:
                    self.assertAlmostEqual(
                        float(sigma),
                        synSigma,
                        delta=0.03,
                        msg='col %s sigma %s is not equal to generated sigma %s'
                        % (name, sigma, synSigma))
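# A quick check (plain Python) of the synSigma approximation used above for the
# single-1 column: the exact population sigma of one 1 among num_rows-1 zeros is
# sqrt(num_rows - 1) / num_rows, which the test's formula approaches as rows grow.
import math

num_rows = 1000
synMean = 1.0 / num_rows
approx = math.sqrt(pow(synMean - 1, 2) / num_rows)  # formula used in the test
exact = math.sqrt(num_rows - 1) / num_rows          # exact population stddev
print approx, exact, abs(approx - exact)            # difference is far below the 0.03 delta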
Example #50
0
    def test_c10_glm_fvec(self):
        h2o.beta_features = True
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        csvFilename = 'classification1Train.txt'
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()

        # hack: force it to NA the header, so we have col names that are not customer-sensitive below
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False, header=0)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # keepList = []
        # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
        # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices
        
        ignore_x = []
        x = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
        for i in range(numCols):
            if i not in x:
                ignore_x.append(i)

        # since we're no longer zero-based, increment by 1
        ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

        
        # GLM Train***********************************************************
        keepPattern = None
        y = 0
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x
        print "ignore_x:", x

        kwargs = {
            'response': y,
            'ignored_cols': ignore_x,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 10,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Parse Test***********************************************************
        GLMModel = glm['glm_model']
        modelKey = GLMModel['_key']

        csvFilename = 'classification1Test.txt'
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"
Example #51
0
    def test_xl_ast_assert_ZZ(self):
        #*****************************************
        a = DF('a1') # inits to -1
        checkAst(astForInit(a))
        # I suppose use of the h2o inspect request is deprecated
        # h2o_cmd.runInspect uses Frames?
        if 1==0:
            inspect = h2o.n0.inspect(key=a) # str(a) becomes 'a1'. so this param should take type Key for key=
            print "a/a1:", dump_json(inspect)

        # let's use runSummary for fun..returns OutputObj for the col
        # will get from column 0, since column not specified
        summaryResult = h2o_cmd.runSummary(key=a)
        co = h2o_cmd.infoFromSummary(summaryResult)
        print "co.label:", co.label
        print "co.data:", co.data

        # how can we get a bunch of data?
        b = DF('b1') # inits to -1
        checkAst(astForInit(b))
        c = DF('c1') # inits to -1
        checkAst(astForInit(c))
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        h2p.yellow_print("Assign compare1")
        Assign(c[0], c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare2")
        Assign(c[0], c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare3")
        Assign(c[0], c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare4")
        Assign(c[0], c[0] != 0)
        checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

        # h2o_xl.debugPrintEnable = True

        #*****************************************
        c = DF('c1')

        h2p.yellow_print("<<= compare1")
        c[0] <<= (c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare2")
        c[0] <<= (c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare3")
        c[0] <<= (c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        #*****************************************
        c = DF('c1') # inits to -1
        h2p.yellow_print("compare1")
        # doesn't assign the result to a key; gets the result if scalar, otherwise gets a list or ???
        # .result can give us a scalar, list, Key, or None

        # .result could be a property that triggers a csv download, if we didn't cache the scalar/list
        # result because it was small, i.e. check if .result_cached was None when the .result property
        # is used (a property avoids the need for ())
        result = Expr(c[0] == -1).result
        checkAst("(n ([ %c1 #0 #0) #-1)")
        h2p.yellow_print("Expr result..Desire: python datatype/value if scalar or list,.else Key: %s %s" % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result) # real result?

        if result:
            print "true for if of result", type(result), result
        else:
            print "else for if of result", type(result), result

        #*****************************************
        # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
        result = Assign(None, c[0]==-1).result
        checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
        h2p.yellow_print("Assign result..Desire: python datatype/value if scalar or list,.else Key: %s %s" % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result) # real result?

        if result:
            print "true if of result", result
        else:
            print "false if of result", result
Example #52
0
    def test_parse_time_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        colCount = 6
        rowCount = 10
        headerData = rand_header(colCount)
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range (1):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsA = inspect['numRows']
            numColsA = inspect['numCols']
            # infoFromInspect returns the per-column missing-value counts
            missingValuesListA = h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
                numCols=numColsA, numRows=numRowsA, noPrint=True)

            print summaryResult
            h2o_cmd.infoFromSummary(summaryResult)
            (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)


            if constantValuesDictA or enumSizeDictA:
                raise Exception("Should be empty?  constantValuesDictA %s enumSizeDictA %s" % (constantValuesDictA, enumSizeDictA))

            print "missingValuesListA", missingValuesListA

            # self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
            self.assertEqual(numColsA, colCount)
            self.assertEqual(numRowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsB = inspect['numRows']
            numColsB = inspect['numCols']
            missingValuesListB = h2o_cmd.infoFromInspect(inspect, csvDownloadPathname)
            print "missingValuesListB", missingValuesListB
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
                numCols=numColsB, numRows=numRowsB, noPrint=True)
            (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
            if constantValuesDictB or enumSizeDictB:
                raise Exception("Should be empty?  constantValuesDictB %s enumSizeDictB %s" % (constantValuesDictB, enumSizeDictB))

            self.assertEqual(missingValuesListA, missingValuesListB,
                "missingValuesList mismatches after re-parse of downloadCsv result")
            self.assertEqual(numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv it creates (it quotes the col numbers if the source had no header).
            # This dataset has a header too, so the row counts should be equal;
            # if not, maybe the parse of our dataset didn't detect a row
            self.assertEqual(numRowsA, numRowsB,
                "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB) )

            # FIX! should do some comparison of values? 
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
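# A standalone sketch (plain Python, no h2o) of the round-trip property this test
# exercises: write a small csv, read it back, and check that row/column counts
# survive the trip. Paths and sizes are hypothetical.
import csv, os, tempfile

def roundtripCounts():
    path = os.path.join(tempfile.mkdtemp(), "roundtrip.csv")
    rows = [["h1", "h2", "h3"]] + [[str(i), str(i * 2), str(i * 3)] for i in range(10)]
    with open(path, "wb") as f:
        csv.writer(f).writerows(rows)
    with open(path, "rb") as f:
        readBack = list(csv.reader(f))
    assert len(readBack) == len(rows), "row count changed in round trip"
    assert len(readBack[0]) == len(rows[0]), "col count changed in round trip"

roundtripCounts()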
Example #53
0
    def test_four_billion_rows(self):
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            ("four_billion_rows.csv", "a.hex"),
            ("four_billion_rows.csv", "b.hex"),
        ]
        for (csvFilename, hex_key) in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            value_size_bytes = inspect['value_size_bytes']
            row_size = inspect['row_size']
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols), \
                "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                "    row_size:", "{:,}".format(row_size)

            expectedRowSize = num_cols * 1  # 1 byte per col, including the output col (see the arithmetic sketch after this test)
            expectedValueSize = expectedRowSize * num_rows
            self.assertEqual(row_size, expectedRowSize,
                msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                (row_size, expectedRowSize))
            self.assertEqual(value_size_bytes, expectedValueSize,
                msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                (value_size_bytes, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                num_cols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, num_cols))
            self.assertEqual(4 * 1000000000,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'epsilon': 1e-6,
                'max_iter': 20,
                'cols': None,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'x': 0,
                'y': 1,
                'n_folds': 0,
                'case_mode': '=',
                'case': 1
            }
            # one coefficient is checked a little more
            colX = 0

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, colX, **kwargs)
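# The expected-size arithmetic above, spelled out (plain Python): with 1-byte
# columns, the expected in-memory value size is simply rows * cols bytes.
num_rows = 4 * 1000000000
num_cols = 2
expectedRowSize = num_cols * 1                 # 1 byte per col in this dataset
expectedValueSize = expectedRowSize * num_rows
print "{:,} expected bytes".format(expectedValueSize)  # 8,000,000,000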
Example #54
0
File: h2o_kmeans.py Project: zhuyuecai/h2o
def bigCheckResults(self, kmeans, csvPathname, parseResult,
                    applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans['model']
        model_key = model['_key']
        centers = model['centers']
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint('kmeans result:', h2o.dump_json(kmeansResult))
        model = kmeansResult['KMeansModel']
        centers = model['clusters']
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass
        # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult['destination_key'],
            model_key=model_key,
            destination_key=predictKey)
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult['summaries'][0]['hcnt']  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult['destination_key'],
            model_key=model_key,
            destination_key=applyDestinationKey)
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(
            key=parseResult['destination_key'], model_key=model_key)
        score = kmeansScoreResult['score']
        rows_per_cluster = score['rows_per_cluster']
        sqr_error_per_cluster = score['sqr_error_per_cluster']

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(
            i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append(
            (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
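# A small sketch (plain Python, hypothetical values) of how callers consume the
# (center, rows_per_cluster, sqr_error_per_cluster) tuples returned above: sort
# by center so results can be compared against an expected center-sorted list.
tupleResultListExample = [
    ([5.0, 2.0], 200, 10.5),
    ([1.0, 1.0], 100, 3.2),
]
sortedResults = sorted(tupleResultListExample, key=lambda t: t[0])
print [t[1] for t in sortedResults]  # cluster sizes in center-sorted order: [100, 200]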
Example #55
0
    def test_four_billion_rows(self):
        h2o.beta_features = False
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
        ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=180)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            # forget about checking the bytesize
            print "\n" + csvFilename, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            expectedRowSize = num_cols * 1  # plus output
            # expectedValueSize = expectedRowSize * num_rows

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                num_cols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, num_cols))
            self.assertEqual(4 * 1000000000,
                             num_rows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, num_rows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'cols': 'C1, C2',
                'initialization': 'Furthest',
                'max_iter': 4,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'y': 'C2',
                'n_folds': 0,
                'family': 'binomial',
                'case_mode': '=',
                'case': 1
            }
            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, 'C1', **kwargs)