Example #1
def exec_expr_list_across_cols(lenNodes, exprList, keyX, 
    minCol=0, maxCol=54, timeoutSecs=10, incrementingResult=True):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):

            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult: # the Result<col> pattern
                resultKey = "Result"+str(colX)
            else: # assume it's a re-assign to self
                resultKey = keyX

            # kbn

            # v1
            # execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, resultKey, timeoutSecs)
            # v2
            execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, None, timeoutSecs)
            print "\nexecResult:", h2o.dump_json(execResultInspect)
            execResultKey = execResultInspect[0]['key']

            # v2: Exec2 'apply' can have no key field? (null) maybe just use keyX then
            if execResultKey:
                resultInspect = h2o_cmd.runInspect(None, execResultKey)
            else:
                resultInspect = h2o_cmd.runInspect(None, keyX)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

            # min is a Python builtin; don't shadow it.
            if incrementingResult: # a col will have a single min
                min_value = checkScalarResult(execResultInspect, resultKey)
                h2o.verboseprint("min_value: ", min_value, "col:", colX)
                print "min_value: ", min_value, "col:", colX
            else:
                min_value = None

            sys.stdout.write('.')
            sys.stdout.flush()

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if h2o.check_sandbox_for_errors():
                raise Exception(
                    "Found errors in sandbox stdout or stderr, on col #%s." % colX)

        print "Column #", colX, "completed\n"
        colResultList.append(min_value)

    return colResultList
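For context, the loop above leans on fill_in_expr_template to substitute the column, row, and key into each expression template. The real helper lives elsewhere in the harness; this hypothetical stand-in only illustrates the substitution idea, assuming <col1>/<col2>/<row>/<keyX>-style placeholders:

def fill_in_expr_template_sketch(exprTemplate, col1, col2, row, keyX):
    # naive placeholder substitution; the placeholder names are assumptions
    expr = exprTemplate.replace('<col1>', str(col1))
    expr = expr.replace('<col2>', str(col2))
    expr = expr.replace('<row>', str(row))
    expr = expr.replace('<keyX>', str(keyX))
    return expr

print fill_in_expr_template_sketch('Result<col1> = min(<keyX>[,<col1>])', 3, 3, 0, 'r.hex')
# Result3 = min(r.hex[,3])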
Example #2
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult["destination_key"]
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key, ":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    resultType = inspectGG["type"]
    if "unparsed" in resultType:
        print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
        print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG["response"]  # dict
    ### rows = inspectGG['rows']
    value_size_bytes = inspectGG["value_size_bytes"]

    model0 = glmGridResult["models"][0]
    alpha = model0["alpha"]
    area_under_curve = model0["area_under_curve"]
    error_0 = model0["error_0"]
    error_1 = model0["error_1"]
    key = model0["key"]
    print "best GLM model key:", key

    glm_lambda = model0["lambda"]

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, key)
    h2o.verboseprint("GLMGrid inspectGLM:", h2o.dump_json(inspectGLM))
    simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
Example #3
    def test_frame_split(self):
        h2o.beta_features = True

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=10)

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        inspect = h2o_cmd.runInspect(key=splitMe)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        for s in range(20):
            inspect = h2o_cmd.runInspect(key=splitMe)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5)
            split0_key = fs['split_keys'][0]
            split1_key = fs['split_keys'][1]
            split0_rows = fs['split_rows'][0]
            split1_rows = fs['split_rows'][1]
            split0_ratio = fs['split_ratios'][0]
            split1_ratio = fs['split_ratios'][1]
            print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split1_key
            # the two halves should differ by at most 1 row; assert within 2
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split0_rows <= 2:
                break
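Since each iteration re-splits one half at ratio 0.5, the working frame shrinks geometrically, roughly origNumRows / 2**(s+1) rows after s+1 splits. A back-of-envelope check (assuming covtype.data's 581,012 rows) shows the 2-row stop condition fires well inside the 20-iteration cap:

rows = 581012                 # covtype.data row count
iterations = 0
while rows > 2 and iterations < 20:
    rows //= 2                # each frame_split halves the frame (ratio 0.5)
    iterations += 1
print iterations, rows        # 18 iterations down to 2 rows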
Example #4
    def test_parse_1m_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(10, 65000, "cH", 30)]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            print "Summary should work with 65k"
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["num_cols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount),
            )
            self.assertEqual(
                inspect["num_rows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["num_rows"], rowCount),
            )

            # we should obey max_column_display
            column_limits = [25, 25000, 50000]
            for column_limit in column_limits:
                inspect = h2o_cmd.runInspect(
                    None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
                )
                self.assertEqual(
                    len(inspect["cols"]), column_limit, "inspect obeys max_column_display = " + str(column_limit)
                )
                for r in range(0, len(inspect["rows"])):
                    # NB: +1 below because each row includes a row header row: #{row}
                    self.assertEqual(
                        len(inspect["rows"][r]),
                        column_limit + 1,
                        "inspect data rows obeys max_column_display = " + str(column_limit),
                    )
Example #5
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
Example #6
def createTestTrain(srcKey, trainDstKey, testDstKey, percent, outputClass, numCols):
    # will have to live with random extract. will create variance
    print "train: get random %. change class 4 to 1, everything else to 0. factor() to turn real to int (for rf)"


    # Create complexity for no good reason! Do the same thing STUPID_REPEAT times in a single exec expression
    execExpr = ""
    STUPID_REPEAT = 20
    for i in range(STUPID_REPEAT):
        execExpr += "a.hex=runif(%s);" % srcKey
        execExpr += "%s=%s[a.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
        if not DO_MULTINOMIAL:
            execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, numCols, trainDstKey, numCols, outputClass)
            execExpr +=  "factor(%s[, %s]);" % (trainDstKey, numCols)

    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=STUPID_REPEAT * 15)

    inspect = h2o_cmd.runInspect(key=trainDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

    print "test: same, but use the same runif() random result, complement"

    execExpr = "a.hex=runif(%s);" % srcKey
    execExpr += "%s=%s[a.hex%s,];" % (testDstKey, srcKey, '>0.9')
    if not DO_MULTINOMIAL:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, numCols, testDstKey, numCols, outputClass)
        execExpr +=  "factor(%s[, %s])" % (testDstKey, numCols)
    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=10)

    inspect = h2o_cmd.runInspect(key=testDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
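The train/test complement idea from the comments, reduced to plain Python: one uniform draw per row, reused for both slices, so the two subsets partition the rows exactly (a sketch, independent of the H2O exec API):

import random
random.seed(42)
rows = range(10)
u = [random.random() for _ in rows]              # one draw per row, reused
train = [r for r, p in zip(rows, u) if p <= 0.9]
test = [r for r, p in zip(rows, u) if p > 0.9]
assert sorted(train + test) == list(rows)        # exact complement, no overlap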
Example #7
    def test_one_hot_expand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 1100, 'cA', 5),
            (100, 1000, 'cB', 5),
            (100, 900, 'cC', 5),
            (100, 800, 'cD', 5),
            (100, 700, 'cE', 5),
            (100, 600, 'cF', 5),
            (100, 500, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # does one_hot modify the original frame, or create a new one?
            oneHotResult = h2o.nodes[0].one_hot(source=parseResult['destination_key'])

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
Example #8
def simpleCheckGBMGrid(self, gbmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = gbmGridResult['destination_key']
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    #type = inspectGG['type']
    #if 'unparsed' in type:
    #    print "Warning: GBM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
    #    print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG['response'] # dict
    ### rows = inspectGG['rows']
    #value_size_bytes = inspectGG['value_size_bytes']

    model0 = gbmGridResult['models'][0]
    alpha = model0['alpha']
    area_under_curve = model0['area_under_curve']
    error_0 = model0['error_0']
    error_1 = model0['error_1']
    model_key = model0['key']
    print "best GBM model key:", model_key

    glm_lambda = model0['lambda']

    # now indirect to the GBM result/model that's first in the list (best)
    inspectGBM = h2o_cmd.runInspect(None, model_key)
    h2o.verboseprint("GBMGrid inspectGBM:", h2o.dump_json(inspectGBM))
    simpleCheckGBM(self, inspectGBM, colX, allowFailWarning=allowFailWarning, **kwargs)
Example #9
    def test_exec2_na_chop(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        inspect = h2o_cmd.runInspect(key='i.hex')
        print "\nr.hex" \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows1 = inspect['numRows']
        numCols = inspect['numCols']

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, keyX='s.hex', maxTrials=200, timeoutSecs=30, maxCol=numCols-1)

        inspect = h2o_cmd.runInspect(key='s.hex')
        print "\ns.hex" \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows2 = inspect['numRows']

        print numRows1, numRows2


        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Example #10
    def test_nfold_frame_extract(self):

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=30)

        print "Just nfold_frame_extract away and see if anything blows up"
        splitMe = hex_key
        inspect = h2o_cmd.runInspect(key=splitMe)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        for s in range(20):
            inspect = h2o_cmd.runInspect(key=splitMe)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # FIX! should check if afold is outside of nfold range allowance
            fs = h2o.nodes[0].nfold_frame_extract(source=splitMe, nfolds=2, afold=random.randint(0,1))
            print "fs", h2o.dump_json(fs)

            split0_key = fs['split_keys'][0]
            split1_key = fs['split_keys'][1]
            split0_rows = fs['split_rows'][0]
            split1_rows = fs['split_rows'][1]
            print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split0_key
            if split0_rows <= 2:
                break

            print "Iteration", s
Example #11
def bigCheckResults(self, kmeans, csvPathname, parseKey, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    model_key = kmeans['destination_key']
    kmeansResult = h2o_cmd.runInspect(key=model_key)
    centers = kmeansResult['KMeansModel']['clusters']

    kmeansApplyResult = h2o.nodes[0].kmeans_apply(
        data_key=parseKey['destination_key'], model_key=model_key,
        destination_key=applyDestinationKey)
    inspect = h2o_cmd.runInspect(None, applyDestinationKey)
    h2o_cmd.infoFromInspect(inspect, csvPathname)

    kmeansScoreResult = h2o.nodes[0].kmeans_score(
        key=parseKey['destination_key'], model_key=model_key)
    score = kmeansScoreResult['score']
    rows_per_cluster = score['rows_per_cluster']
    sqr_error_per_cluster = score['sqr_error_per_cluster']

    tupleResultList = []
    for i,c in enumerate(centers):
        print "\ncenters["+str(i)+"]: ", centers[i]
        print "rows_per_cluster["+str(i)+"]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster["+str(i)+"]: ", sqr_error_per_cluster[i]
        tupleResultList.append( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) )

    return (centers, tupleResultList)
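The per-cluster tuple assembly at the end is just a three-way zip; an equivalent one-liner, if the per-cluster prints weren't wanted:

tupleResultList = zip(centers, rows_per_cluster, sqr_error_per_cluster)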
Example #12
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult['destination_key']
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    #type = inspectGG['type']
    #if 'unparsed' in type:
    #    print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
    #    print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG['response'] # dict
    ### rows = inspectGG['rows']
    #value_size_bytes = inspectGG['value_size_bytes']

    # FIX! does error_0/1 only exist for binomial?
    for m, model in enumerate(glmGridResult['models']):
        alpha = model['alpha']
        area_under_curve = model['area_under_curve']
        # FIX! should check max error?
        error_0 = model['error_0']
        error_1 = model['error_1']
        model_key = model['key']
        print "#%s GLM model key: %s" % (m, model_key)
        glm_lambda = model['lambda']

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key'])
    h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM))
    g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)

    return g
Example #13
    def test_parse_fs_schmoo_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        # rowData = "1,0,65,1,2,1,1.4,0,6"
        rowData = "1,0,65,1,2,1,1,0,6"

        totalRows = 99860
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        print "Updating the key and hex_key names for each trial"
        for trial in range(200):
            append_syn_dataset(csvPathname, rowData)
            totalRows += 1

            start = time.time()
            key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "trial #", trial, "totalRows:", totalRows, "parse end on ", \
                csvFilename, 'took', time.time() - start, 'seconds'

            h2o_cmd.runInspect(key=hex_key)
            # only used this for debug to look at parse (red last row) on failure
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
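write_syn_dataset and append_syn_dataset aren't shown in these snippets; hypothetical stand-ins consistent with how they're called here (a header line plus N copies of the same row, then one row per append):

def write_syn_dataset(csvPathname, totalRows, headerData, rowData):
    # header line, then totalRows copies of the same data row
    with open(csvPathname, 'w') as f:
        f.write(headerData + '\n')
        for _ in xrange(totalRows):
            f.write(rowData + '\n')

def append_syn_dataset(csvPathname, rowData):
    # grow the file by one row between parse trials
    with open(csvPathname, 'a') as f:
        f.write(rowData + '\n')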
Example #14
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
# "grid": {
#    "destination_keys": [
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_0", 
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_1", 
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_2"
#   ]
# }, 
    if h2o.beta_features:
        destination_key = glmGridResult['grid']['destination_keys'][0]
        inspectGG = h2o.nodes[0].glm_view(destination_key)
        models = inspectGG['glm_model']['submodels']
        h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0]))
        g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs)
    else:
        destination_key = glmGridResult['destination_key']
        inspectGG = h2o_cmd.runInspect(None, destination_key)
        h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))
        models = glmGridResult['models']
        for m, model in enumerate(models):
            alpha = model['alpha']
            area_under_curve = model['area_under_curve']
            # FIX! should check max error?
            error_0 = model['error_0']
            error_1 = model['error_1']
            model_key = model['key']
            print "#%s GLM model key: %s" % (m, model_key)
            glm_lambda = model['lambda']

        # now indirect to the GLM result/model that's first in the list (best)
        inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key'])
        h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM))
        g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
    return g
Example #15
    def test_exec2_sum(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if getpass.getuser() == 'jenkins':
            csvPathname = 'standard/billion_rows.csv.gz'
        else:
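            # pick one dataset path; only the last uncommented assignment takes effect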
            csvPathname = '1B/reals_100000x1000_15f.data'
            csvPathname = '1B/reals_1B_15f.data'
            csvPathname = '1B/reals_1000000x1000_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        for execExpr in exprList:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            print 'exec took', time.time() - start, 'seconds'
            print "result:", result

        h2o.check_sandbox_for_errors()
Example #16
    def test_sort_of_prostate_with_row_schmoo(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        rowData = "1,0,65,1,2,1,1.4,0,6"

        write_syn_dataset(csvPathname,      99860, headerData, rowData)

        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        print "Updating the key and key2 names for each trial"
        for trial in range(200):
            append_syn_dataset(csvPathname, rowData)
            ### start = time.time()
            # this was useful to cause failures early on. Not needed eventually
            ### key = h2o_cmd.parseFile(csvPathname=h2o.find_file("smalldata/logreg/prostate.csv"))
            ### print "Trial #", trial, "parse end on ", "prostate.csv" , 'took', time.time() - start, 'seconds'

            start = time.time()
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            key = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2)
            print "trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'

            h2o_cmd.runInspect(key=key2)
            # only used this for debug to look at parse (red last row) on failure
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #17
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult  = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)

        csvPathname_test  = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=validation_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
Example #18
    def test_NOPASS_create_frame_fail(self):
        h2o.beta_features = True

        for trial in range(20):
            kwargs = {
                'integer_range': None, 'missing_fraction': 0.1, 'cols': 10,
                'response_factors': 1, 'seed': 1234, 'randomize': 1,
                'categorical_fraction': 0, 'rows': 1, 'factors': 0,
                'real_range': 0, 'value': None, 'integer_fraction': 0}

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Example #19
    def test_GLM2_tnc3_10(self):
        h2o.beta_features = True
        csvFilename = 'tnc3_10.csv'
        print "\n" + csvFilename
        hex_key = "tnc3.hex"
        h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseResult
            # in any case, the destination_key in parseResult was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example #20
    def test_tnc3_ignore(self):
        csvFilename = 'tnc3_10.csv'
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
        print "Parse result['Key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseKey
            # in any case, the destination_key in parseKey was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example #21
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans["model"]
        model_key = model["_key"]
        centers = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult))
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        # no scoring on Kmeans2? ..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
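In the beta path, Summary's histogram bin counts (hcnt) stand in for rows-per-cluster; the same tally over predicted cluster labels in plain Python (a sketch, not the Summary API):

from collections import Counter
predicted = [0, 2, 1, 0, 0, 2]                   # per-row cluster assignments
hcnt = Counter(predicted)
rows_per_cluster = [hcnt[k] for k in sorted(hcnt)]
print rows_per_cluster                           # [3, 1, 2]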
Example #22
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        model_key = kmeans["model"]["_selfKey"]
        # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame

        # can't use inspect on a model key? now?
        kmeansResult = kmeans
        model = kmeansResult["model"]
        centers = model["clusters"]
        error = model["error"]
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        # no scoring on Kmeans2? ..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # have to figure out how to get this with fvec
        sqr_error_per_cluster = [0 for h in hcnt]

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", centers[i]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
Example #23
    def test_rf_float_bigexp_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        totalRows = 1000
        colCount = 7
        write_syn_dataset(csvPathname, totalRows, colCount, headerData)

        for trial in range (5):
            # grow the data set
            rowData = rand_rowData(colCount)
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, colCount, num)
            totalRows += num

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            ntree = 2
            kwargs = {
                'ntrees': ntree,
                'mtries': None,
                'max_depth': 20,
                'sample_rate': 0.67,
                'destination_key': None,
                'nbins': 1024,
                'seed': 784834182943470027,
            }
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, doSummary=True)
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']

            start = time.time()
            rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=15, pollTimeoutSecs=5, **kwargs)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)

            # cm0 = rfView['drf_model']['cms'][0]['_arr']
            # print cm0
            # self.assertEqual(len(cm0), numCols,
            #     msg="%s cols in cm, means rf must have ignored some cols. I created data with %s cols" % (len(cm0), numCols-1))

            inspect = h2o_cmd.runInspect(key=hex_key)
            cols = inspect['cols']
            numCols = inspect['numCols']
            for i,c in enumerate(cols):
                if i < (numCols-1): # everything except the last col (output) should be 8 byte float
                    colType = c['type']
                    self.assertEqual(colType, 'Real', msg="col %d should be type Real: %s" % (i, colType))

            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #24
    def test_exec2_xorsum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 1, 'r1', 0, 10, None),
        ]

        ullResultList = []
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # dynamic range of the data may be useful for estimating error
            maxDelta = expectedMax - expectedMin

            csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            print "Creating random", csvPathname
            (expectedUll, expectedFpSum)  = write_syn_dataset(csvPathname, 
                rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key, 
                timeoutSecs=3000, retryDelaySecs=2)
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "numRows:", inspect['numRows']
            print "numCols:", inspect['numCols']
            inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
            print "inspect offset = -1:", h2o.dump_json(inspect)

            
            # looking at the 8 bytes of bits for the h2o doubles
            # xorsum will zero out the sign and exponent
            for execExpr in exprList:
                start = time.time()
                (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, 
                    resultKey=None, timeoutSecs=300)
                print 'exec took', time.time() - start, 'seconds'
                print "execResult:", h2o.dump_json(execResult)
                print ""
                print "%30s" % "fpResult:", "%.15f" % fpResult
                ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                print "%30s" % "bitResult (0.16x):", "0x%0.16x" % ullResult
                print "%30s" % "expectedUll (0.16x):", "0x%0.16x" % expectedUll
                # print "%30s" % "hex(bitResult):", hex(ullResult)
                ullResultList.append((ullResult, fpResult))

            h2o.check_sandbox_for_errors()

            print "first result was from a sum. others are xorsum"
            print "ullResultList:"
            for ullResult, fpResult in ullResultList:
                print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
            expectedUllAsDouble = h2o_util.unsignedLongLongToDouble(expectedUll)
            print "%30s" % "expectedUll (0.16x):", "0x%0.16x   %s" % (expectedUll, expectedUllAsDouble)
            expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
            print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
Example #25
    def test_KMeans_params_rand2(self):
        SEED = random.randint(0, sys.maxint)
        # if you have to force to redo a test
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 400),
                ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 400),
                ('covtype200x.data', 2000),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params()
            for trial in range(3):
                randomV = paramDict['k']
                k = random.choice(randomV)

                randomV = paramDict['epsilon']
                epsilon = random.choice(randomV)

                randomV = paramDict['cols']
                cols = random.choice(randomV)

                kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'}
                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #26
    def test_exec2_row_range(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(1000000, 5, "cA", 200)]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False
            )
            print "Parse:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["numCols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["numCols"], colCount),
            )
            self.assertEqual(
                inspect["numRows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["numRows"], rowCount),
            )

            REPEAT = 1
            for i in range(REPEAT):
                hex_key_i = hex_key + "_" + str(i)
                execExpr = "%s=%s[1,]" % (hex_key_i, hex_key)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, 100)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, rowCount - 10)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                inspect = h2o_cmd.runInspect(None, hex_key_i, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromInspect(inspect, hex_key_i)
                print "\n" + hex_key_i, "    numRows:", "{:,}".format(
                    inspect["numRows"]
                ), "    numCols:", "{:,}".format(inspect["numCols"])
Example #27
    def test_rf_hhp_2a_fvec(self):
        h2o.beta_features = True
        csvFilenameList = [
            'hhp.cut3.214.data.gz',
            ]

        for csvFilename in csvFilenameList:
            csvPathname = csvFilename
            print "RF start on ", csvPathname
            dataKeyTrain = 'rTrain.hex'
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=dataKeyTrain, schema='put',
                timeoutSecs=120)            
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numCols = inspect['numCols']

            # we want the last col. Should be values 0 to 14. 14 most rare

            # from the cut3 set
            #   84777 0
            #   13392 1
            #    6546 2
            #    5716 3
            #    4210 4
            #    3168 5
            #    2009 6
            #    1744 7
            #    1287 8
            #    1150 9
            #    1133 10
            #     780 11
            #     806 12
            #     700 13
            #     345 14
            #    3488 15

            execExpr = "%s[,%s] = %s[,%s]==14" % (dataKeyTrain, numCols, dataKeyTrain, numCols)
            h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
            inspect = h2o_cmd.runInspect(key=dataKeyTrain)
            h2o_cmd.infoFromInspect(inspect, "going into RF")
            execResult = {'destination_key': dataKeyTrain}


            kwargs = {
                'ntrees': 20,
                'max_depth': 20,
                'nbins': 50,
            }
            rfView = h2o_cmd.runRF(parseResult=execResult, timeoutSecs=900, retryDelaySecs=10, **kwargs)
            print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
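The exec expression above rewrites the last column into a 0/1 indicator for class 14; the same transform per row in plain Python (sketch):

lastCol = [0, 14, 3, 14, 7]
indicator = [1 if v == 14 else 0 for v in lastCol]
print indicator    # [0, 1, 0, 1, 0]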
Example #28
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {
            'rows': 1, 
            'cols': 1
        }
        for trial in range(20):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:   
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None
                
            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex')
            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Example #29
    def test_frame_split_balance(self):
        h2o.beta_features = True

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20)

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        inspect = h2o_cmd.runInspect(key=splitMe)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        for s in range(20):
            inspect = h2o_cmd.runInspect(key=splitMe)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5)
            split0_key = fs['split_keys'][0]
            split1_key = fs['split_keys'][1]
            split0_rows = fs['split_rows'][0]
            split1_rows = fs['split_rows'][1]
            split0_ratio = fs['split_ratios'][0]
            split1_ratio = fs['split_ratios'][1]
            print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split1_key
            # the two halves should differ by at most 1 row; assert within 2
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split0_rows <= 2:
                break

            print "Now do some rebalancing on the split frames"
            for trial in range(2):
                rb_key = "rb_%s_%s" % (trial, splitMe)
                SEEDPERFILE = random.randint(0, sys.maxint)
                randChunks = random.randint(1, 100)
                start = time.time()
                print "Trial %s: Rebalancing %s to %s with %s chunks" % (trial, splitMe, rb_key, randChunks)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\
                h2o_cmd.runSummary(key=rb_key)
                print "\nInspecting the original parsed result"
                inspect = h2o_cmd.runInspect(key=hex_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
                print "\nInspecting the rebalanced result with %s forced chunks" % randChunks
                inspect = h2o_cmd.runInspect(key=rb_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
Example #30
    def test_tnc3_ignore(self):
        csvFilename = "tnc3.csv"
        csvPathname = h2o.find_file("smalldata/" + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1)
        print "Parse result['Key']:", parseKey["destination_key"]
        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if 1 == 0:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes, numExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10
            )
            print "\ncolResultList after num swap", colResultList

        if 1 == 1:
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        # ******************
        if 1 == 0:
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes, charExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10
            )
            print "\ncolResultList after char swap", colResultList

        if 1 == 1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Example #31
    def test_parse_bounds_libsvm(self):
        print "Random 0/1 for col1. Last has max col = 1, All have zeros for class."
        ## h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 100, 'cA', 300),
            (100000, 100, 'cB', 300),
            (100, 100000, 'cC', 300),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=timeoutSecs, doSummary=False)
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                row_size = inspect['row_size']
                value_size_bytes = inspect['value_size_bytes']
                print "\n" + csvPathname, \
                    "    num_rows:", "{:,}".format(num_rows), \
                    "    num_cols:", "{:,}".format(num_cols), \
                    "    value_size_bytes:", "{:,}".format(value_size_bytes), \
                    "    row_size:", "{:,}".format(row_size)

                expectedRowSize = num_cols * 1  # 1 byte per col; num_cols already includes the output col
                expectedValueSize = expectedRowSize * num_rows
                self.assertEqual(row_size, expectedRowSize,
                    msg='row_size %s is not expected num_cols * 1 byte: %s' % \
                    (row_size, expectedRowSize))
                self.assertEqual(value_size_bytes, expectedValueSize,
                    msg='value_size_bytes %s is not expected row_size * rows: %s' % \
                    (value_size_bytes, expectedValueSize))


                summaryResult = h2o_cmd.runSummary(key=key2, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(colNumberMax+1, num_cols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))
                self.assertEqual(rowCount, num_rows, 
                    msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))

                summary = summaryResult['summary']
                columnsList = summary['columns']
                self.assertEqual(colNumberMax+1, len(columnsList), 
                    msg="generated %s cols (including output).  summary has %s columns" % (colNumberMax+1, len(columnsList)))

                for columns in columnsList:
                    N = columns['N']
                    # self.assertEqual(N, rowCount)
                    name = columns['name']
                    stype = columns['type']

                    histogram = columns['histogram']
                    bin_size = histogram['bin_size']
                    bin_names = histogram['bin_names']
                    bins = histogram['bins']
                    nbins = histogram['nbins']  # original had histogram['bins'] twice; 'nbins' is presumably intended

                    # definitely not enums
                    zeros = columns['zeros']
                    na = columns['na']
                    smax = columns['max']
                    smin = columns['min']
                    mean = columns['mean']
                    sigma = columns['sigma']

                    # a single 1 in the last col
                    if name == "V" + str(colNumberMax): # h2o puts a "V" prefix
                        synZeros = num_rows - 1
                        synSigma = None # not sure..depends on the # rows somehow (0 count vs 1 count)
                        synMean = 1.0/num_rows
                        # summary returns N min / N max values, hence the 2-entry lists
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    elif name == ("V1"):
                        # can reverse-engineer the # of zeroes, since data is always 1
                        synSum = synColSumDict[1] # could get the same sum for all ccols
                        synZeros = num_rows - synSum
                        synSigma = 0.50
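                        # Bernoulli 0/1 col with p ~= 0.5: sigma = sqrt(p*(1-p)) ~= 0.5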
                        synMean = (synSum + 0.0)/num_rows
                        synMin = [0.0, 1.0]
                        synMax = [1.0, 0.0]
                    else:
                        synZeros = num_rows
                        synSigma = 0.0
                        synMean = 0.0
                        synMin = [0.0]
                        synMax = [0.0]

                    # print zeros, synZeros
                    self.assertAlmostEqual(float(mean), synMean, places=6,
                        msg='col %s mean %s is not equal to generated mean %s' % (name, mean, synMean))

                    # summary returns the N smallest / N largest values, which is why min/max are lists
                    self.assertEqual(smin, synMin,
                        msg='col %s min %s is not equal to generated min %s' % (name, smin, synMin))

                    # reverse engineered the number of zeroes, knowing data was always 1 if present?
                    if name == "V65536" or name == "V65537":
                        print "columns around possible zeros mismatch:", h2o.dump_json(columns)

                    self.assertEqual(zeros, synZeros,
                        msg='col %s zeros %s is not equal to generated zeros count %s' % (name, zeros, synZeros))

                    self.assertEqual(stype, 'number',
                        msg='col %s type %s is not equal to %s' % (name, stype, 'number'))

                    # our random generation will have some variance for col 1, so allow a small delta
                    if synSigma:
                        self.assertAlmostEqual(float(sigma), synSigma, delta=0.03,
                            msg='col %s sigma %s is not equal to generated sigma %s' % (name, sigma, synSigma))

                    if CHECK_MAX:
                        self.assertEqual(smax, synMax,
                            msg='col %s max %s is not equal to generated max %s' % (name, smax, synMax))

                    self.assertEqual(0, na,
                        msg='col %s num_missing_values %d should be 0' % (name, na))
Example #32
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
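            # note: this overrides the hex_key from tryList; the Key('p') references below assume the name 'p'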
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount),
                                     random.randint(1, iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice

                cutExprList = []

                pKey = Key('p')
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:, i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else:
                        cutExpr = ce
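                # left-fold the sampled per-column comparisons into one conjunction:
                # ((c0 & c1) & c2) ... a row must pass every sampled cut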

                print "cutExpr:", cutExpr

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # build up the columns
            Assign('b', [1, 2, 3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))
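            # Cbind of colCount references to 'b' column-binds colCount copies of the
            # 3-row vector into a single frame 'a'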

            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                if 1 == 1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."

                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                        inspect)

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
Example #33
    def test_NN2_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 90
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 20.0,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.04
        relTol = 0.01
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example #34
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print "parse end on ", hex_key, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            kwargs = {
                'cols': None,
                'initialization': 'Furthest',
                'k': 12
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \

if __name__ == '__main__':
    h2o.unit_main()
Example #35
    def test_many_fp_formats_libsvm(self):
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30, 'sparse'),
            (100, 100, 'cF', 30, 'sparse50'),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList:
            # for sel in range(48): # len(caseList)
            for sel in [random.randint(0, 47)]:  # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel,
                                                       rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict,
                 colNumberMax) = write_syn_dataset(csvPathname, rowCount,
                                                   colCount, SEEDPERFILE, sel,
                                                   distribution)

                selKey2 = key2 + "_" + str(sel)
                parseKey = h2o_cmd.parseFile(None,
                                             csvPathname,
                                             key2=selKey2,
                                             timeoutSecs=timeoutSecs)
                print csvFilename, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey[
                    'destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseKey['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the first one
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0, key=parseKey['destination_key'], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2,
                                                       timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(
                    colNumberMax + 1,
                    num_cols,
                    msg=
                    "generated %s cols (including output).  parsed to %s cols"
                    % (colNumberMax + 1, num_cols))

                # Exec (column sums)*************************************************
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                # use num_cols?. num_cols should be <= colCount.

                colSumList = h2e.exec_expr_list_across_cols(
                    None,
                    exprList,
                    selKey2,
                    maxCol=colNumberMax + 1,
                    timeoutSecs=timeoutSecs)

                self.assertEqual(rowCount,
                                 num_rows,
                                 msg="generated %s rows, parsed to %s rows" %
                                 (rowCount, num_rows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k, v in synColSumDict.iteritems():
                    if k > colNumberMax:  # ignore any extra 0 cols at the end
                        continue

                    # k should be integers that match the number of cols
                    self.assertTrue(
                        k >= 0 and k < len(colSumList),
                        msg="k: %s len(colSumList): %s num_cols: %s" %
                        (k, len(colSumList), num_cols))

                    syn = {}
                    if k == 0:
                        syn['name'] = "Target"
                        syn['size'] = {1, 2}  # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                        syn['type'] = {'int'}
                        syn['min'] = classMin
                        syn['max'] = classMax
                        # don't check these for the col 0 'Target'
                        syn['scale'] = {1}
                        # syn['base'] = 0
                        # syn['variance'] = 0
                    elif k == 1:  # we forced this to always be 0
                        syn['name'] = "V" + str(k)
                        syn['size'] = {1}
                        syn['type'] = {'int'}
                        syn['min'] = 0
                        syn['max'] = 0
                        syn['scale'] = {1}
                        syn['base'] = 0
                        syn['variance'] = 0
                    else:
                        syn['name'] = "V" + str(k)
                        syn['size'] = {1, 2, 4, 8}  # element size in bytes; a set, for membership check
                        syn['type'] = {'int', 'float'}
                        syn['min'] = valMin
                        syn['max'] = valMax
                        syn['scale'] = {1, 10, 100, 1000}
                        # syn['base'] = 0
                        # syn['variance'] = 0

                    syn['num_missing_values'] = 0
                    syn['enum_domain_size'] = 0
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect['cols'][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but
                        # we can check that it's a subset of the allowed range
                        if synKey == 'min':
                            self.assertTrue(
                                syn[synKey] <= cols[synKey],
                                msg='col %s %s %s should be <= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'max':
                            self.assertTrue(
                                syn[synKey] >= cols[synKey],
                                msg='col %s %s %s should be >= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                            if cols[synKey] not in syn[synKey]:
                                # for debug of why it was a bad size
                                print "cols size/min/max:", cols['size'], cols[
                                    'min'], cols['max']
                                print "syn size/min/max:", syn['size'], syn[
                                    'min'], syn['max']
                                raise Exception(
                                    'col %s %s %s should be in this allowed %s'
                                    % (k, synKey, cols[synKey], syn[synKey]))
                        else:
                            self.assertEqual(
                                syn[synKey],
                                cols[synKey],
                                msg='col %s %s %s should be %s' %
                                (k, synKey, cols[synKey], syn[synKey]))

                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
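                    # places=0 means assertAlmostEqual checks round(v - colSum, 0) == 0,
                    # i.e. the two sums must agree to within 0.5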
                    self.assertAlmostEqual(
                        float(v),
                        colSum,
                        places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' %
                        (colSum, v))
Example #36
    def test_parse_multi_exclude_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
        tryList = [
            (300, 100, 'cA', 60, '*x[2-5]*'),
            (310, 200, 'cB', 60, '*x[1,3-5]*'),
            (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
            (330, 400, 'cD', 60, '*x[1-3-5]*'),
            (340, 500, 'cE', 60, '*x[1-4]*'),
        ]

        ## h2b.browseTheCloud()
        cnum = 0
        # create them all first
        for (rowCount, colCount, hex_key, timeoutSecs,
             excludePattern) in tryList:
            cnum += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            for fileN in range(FILENUM):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount, SEED,
                                  translateList)

        for (rowCount, colCount, hex_key, timeoutSecs,
             excludePattern) in tryList:
            cnum += 1
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                print f
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put')

            # pattern match all, then use exclude
            parseResult = h2i.parse_only(pattern="*syn_*",
                                         hex_key=hex_key,
                                         exclude=excludePattern,
                                         header=1,
                                         timeoutSecs=timeoutSecs)
            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # FIX! h2o strips one of the headers, but treats all the other files with headers as data
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            print "\n" + parseResult['destination_key'] + ":", \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # all should have rowCount rows (due to the excludePattern)
            self.assertEqual(numRows, rowCount*FILENUM, msg=("got numRows: %s. Should be rowCount: %s * FILENUM: %s" % \
                (numRows, rowCount, FILENUM)))
Example #37
    def test_rf_covtype_train_oobe2(self):
        print "\nUse randomBitVector and filter to separate the dataset randomly"
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"

        h2i.setupImportFolder(None, importFolderPath)
        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             key2=key2,
                                             header=0,
                                             timeoutSecs=100)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]
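        # e.g. covtype's 581,012 rows: pct10 = 58101, rowsForPct[9] = 522909,
        # so last10 = 581012 - 522909 = 58103 rows in the held-out 10%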

        expectTrainPctRightList = [
            0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
        ]
        expectScorePctRightList = [
            0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
        ]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        dataKeyTrain = "rTrain"
        # start at 90% rows + 1
        # randomBitVector(size,selected)
        # randomFilter(srcFrame,rows,seed)
        # filter(srcFrame,bitVect)

        # odd. output is byte, all other exec outputs are 8 byte? (at least the ones below?)
        execExpr = "rbv=randomBitVector(" + str(num_rows) + "," + str(
            last10) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey="rbv", timeoutSecs=10)
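        # rbv has a 1 on last10 randomly chosen rows (the ~10% scoring set);
        # filter() below keeps exactly the rows whose bit is set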

        # complement the bit vector
        execExpr = "not_rbv=colSwap(rbv,0,rbv[0]==0?1:0)"
        h2o_exec.exec_expr(None, execExpr, resultKey="not_rbv", timeoutSecs=10)

        execExpr = dataKeyTest + "=filter(" + key2 + ",rbv)"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTest,
                           timeoutSecs=10)

        execExpr = dataKeyTrain + "=filter(" + key2 + ",not_rbv)"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTrain,
                           timeoutSecs=10)

        ### time.sleep(3600)
        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]

        for trial in range(1, 10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r" + str(trial)
            execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(
                rowsToUse) + ")"
            # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
            h2o_exec.exec_expr(None,
                               execExpr,
                               resultKey=resultKey,
                               timeoutSecs=10)
            parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(trial)

            rfv = h2o_cmd.runRFOnly(parseKey=parseKey,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (
                1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=0.2)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)
            # scoring
            # RFView.html?
            # dataKeyTest=a5m.hex&
            # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
            # response_variable=1&
            # ntree=50&
            # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
            # out_of_bag_error_estimate=1&
            # http://192.168.1.28:54321/GeneratePredictionsPage.html?model_key=__RFModel_0e2531bc-2552-4f65-8a4a-843031b0cb99&key=iris
            # http://192.168.1.28:54321/RFView.html?data_key=iris.hex&model_key=__RFModel_0e2531bc-2552-4f65-8a4a-843031b0cb99&ntree=50&response_variable=4&class_weights=Iris-setosa%3D1.0%2CIris-versicolor%3D1.0%2CIris-virginica%3D1.0&out_of_bag_error_estimate=true&iterative_cm=true

            kwargs['iterative_cm'] = 1
            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            rfv = h2o_cmd.runRFView(None,
                                    dataKeyTest,
                                    model_key,
                                    ntree,
                                    timeoutSecs,
                                    retryDelaySecs=1,
                                    print_params=True,
                                    **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key,
                                              data_key=dataKeyTest)

            fullScorePctRight = 100 * (
                1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]),
                delta=0.2)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / num_rows), "pct. of all rows"

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectScorePctRightList, actualScorePctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp
Example #38
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 3,
                'max_depth': 10,
                'nbins': 50,
                'timeoutSecs': 600,
                'response': 'C54',
                'classification': 1,
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data': 1,
                'ntree': 10,
                'depth': 300,
                'bin_limit': 20000,
                'stat_type': 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit': 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify it. But put it here (and above, if used)
                # in case a dataset doesn't use the last col
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }

        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        if 1 == 0:
            print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(trainResult1,
                                   trainResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(scoreResult1,
                                   scoreResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
Example #39
    def test_xl_basic(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexDF = 'v'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexDF)

        # uses h2o_xl to do magic with Rapids
        # does this DFInit to rows=0 now?
        a = DF('a1')  # known_* key
        assert isinstance(a, DF)
        assert isinstance(a, Key)
        assert isinstance(a, Xbase)
        assert not isinstance(a, KeyIndexed)
        assert not isinstance(a, Fcn)
        assert not isinstance(a, Assign)

        # look at our secret stash in the base class. Should see the DFInit?
        print "Does the lastExecResult stash work?", dump_json(
            h2o_xl.Xbase.lastExecResult)
        # this should work if str(DF) returns DF.frame
        inspect = h2o_cmd.runInspect(key=a)
        # print "inspect a", dump_json(inspect)

        b = DF('b1')
        assert isinstance(b, DF)
        inspect = h2o_cmd.runInspect(key=b)
        # print "inspect b", dump_json(inspect)

        Assign(a, [0, 0, 0])
        assert isinstance(a, Key)
        b <<= [0, 0, 0]
        assert isinstance(b, Key)
        # FIX! how come I have to create c here first for python
        # see here
        # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
        # is it too much to require c to exist first?
        # c = DF()
        # c <<= a + b

        # this will trigger ok?
        c = DF('c1')
        c <<= [0, 0, 0]
        assert isinstance(c, Key)
        # c[0] <<= a + b
        # Assign(lhs=c[0], rhs=(a + b))
        rhs = a + b
        Assign(c, rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= !c1 (+ %a1 %b1))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)
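        # Rapids AST notation: !key marks an assignment target, %key a key reference,
        # #n a numeric literal, and ([ frame rows cols) a slice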

        rhs = a[0] + b[0]
        Assign(c[0], rhs)
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        Assign(c[1], (a[2] + b[2]))
        ast = h2o_xl.Xbase.lastExecResult['ast']
        astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
        assert ast == astExpected, "Actual: %s    Expected: %s" % (ast,
                                                                   astExpected)

        # assert ast = "(= !b1 (is.na (c {#0})))"

        assert isinstance(c, Key), type(c)

        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        # DF inits the frame
        # if you just want an existing Key, say existing=True
        a = DF('a2')  # named data frame
        assert isinstance(a, DF)
        b = DF('b2')
        c = DF('c2')
        inspect = h2o_cmd.runInspect(key=c)
        # # print "inspect c", dump_json(inspect)

        a <<= 3
        b <<= 3
        c <<= 3
        c[0] <<= a[0] + b[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        a = DF('a3')  # named data frame
        b = DF('b3')
        c = DF('c3')
        a <<= 4
        b <<= 4
        c <<= 4

        c[0] <<= a[0] - b[0]
        assert isinstance(c, Key)
        c[0] <<= a[0] * b[0]
        assert isinstance(c, Key)

        a = DF('a4')  # named data frame
        b = DF('b4')
        c = DF('c4')
        a <<= 5
        b <<= 5
        c <<= 5
        c[0] <<= (a[0] - b[0])
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        c[0] <<= (a[0] & b[0]) | a[0]
        assert isinstance(c, Key)
        inspect = h2o_cmd.runInspect(key=c)
        # print "inspect c", dump_json(inspect)

        # print "\nDoes the keyWriteHistoryList work?"
        for k in Xbase.keyWriteHistoryList:
            print k

        h2o.check_sandbox_for_errors()
Example #40
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz',
                 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex'
                 )
            ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
            ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            h2o.beta_features = False  #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='s3n',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                noPoll=h2o.beta_features,
                                                doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 into something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)
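            # col 378 is now a 0/1 response: 1 where the raw value exceeded 15, else 0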

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               noPoll=h2o.beta_features,
                                               doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 into something we can do binomial regression on!
            print "Slow! exec is converting all imported keys, not just what was parsed?"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols':
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # 'errs' from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'],
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    # hack
                    if h2o.beta_features:
                        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                         pollTimeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict',  # choices are 0 and 'predict'
                    )

                    # 'errs' from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)

            h2o.beta_features = False

            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                                  fListTitle, fList, fLabel)
Example #41
    def test_GLM_convergence_2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 1, 'cD', 300),
            # (100, 100, 'cE', 300),
            # (100, 200, 'cF', 300),
            # (100, 300, 'cG', 300),
            # (100, 400, 'cH', 300),
            # (100, 500, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        USEKNOWNFAILURE = False
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_%s_%sx%s.csv' % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            if USEKNOWNFAILURE:
                csvFilename = 'failtoconverge_100x50.csv'
                csvPathname = 'logreg/' + csvFilename

            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           schema='put')
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            y = colCount
            kwargs = {
                'max_iter': 40,
                'lambda': 1e0,
                'alpha': 0.5,
                'link': 'familyDefault',
                'n_folds': 0,
                'beta_epsilon': 1e-4,
                'thresholds': '0:1:0.01',
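                # thresholds above is presumably a start:end:step sweep spec,
                # i.e. scan classification thresholds from 0 to 1 in 0.01 steps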
            }

            if USEKNOWNFAILURE:
                kwargs['y'] = 50
            else:
                kwargs['y'] = y

            emsg = None
            for i in range(3):
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
                print 'glm #', i, 'end on', csvPathname, 'took', \
                    time.time() - start, 'seconds'
                # we can pass the warning, without stopping in the test, so we can
                # redo it in the browser for comparison
                (warnings, coefficients,
                 intercept) = h2o_glm.simpleCheckGLM(self,
                                                     glm,
                                                     None,
                                                     allowFailWarning=True,
                                                     **kwargs)

                if 1 == 0:
                    print "\n", "\ncoefficients in col order:"
                    # since we're loading the x50 file all the time..the real colCount
                    # should be 50 (0 to 49)
                    if USEKNOWNFAILURE:
                        showCols = 50
                    else:
                        showCols = colCount
                    for c in range(showCols):
                        print "%s:\t%s" % (c, coefficients[c])
                    print "intercept:\t", intercept

                # gets the failed to converge, here, after we see it in the browser too
                x = re.compile("[Ff]ailed")
                if warnings:
                    print "warnings:", warnings
                    for w in warnings:
                        print "w:", w
                        if (re.search(x, w)):
                            # first
                            if emsg is None: emsg = w
                            print w
                if emsg: break

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(5)
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(5)

            # gets the failed to converge, here, after we see it in the browser too
            if emsg is not None:
                raise Exception(emsg)
Example #42
    def test_GLM_100Mx70_hosts(self):
        # SYNDATASETS_DIR is also used below for the parse path, so create it unconditionally
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # enable this if you need to re-create the file
        if 1 == 0:
            createList = [
                (100000000, 70, 'cA', 10000),
            ]

            for (rowCount, colCount, hex_key, timeoutSecs) in createList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                    rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            # Have to copy it to /home/0xdiag/datasets!

        if localhost:
            csvFilenameList = [
                # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'),
                # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'),
                ('rand_logreg_100000000x70.csv', 500, 'rand_100Mx70.hex'),
            ]
        else:
            # None is okay for hex_key
            csvFilenameList = [
                # ('rand_logreg_500Kx70.csv.gz', 500, 'rand_500Kx70'),
                # ('rand_logreg_1Mx70.csv.gz', 500, 'rand_1Mx70'),
                ('rand_logreg_100000000x70.csv', 500, 'rand_100Mx70.hex'),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for csvFilename, timeoutSecs, hex_key in csvFilenameList:
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=2000,
                                           retryDelaySecs=5,
                                           initialDelaySecs=10,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            y = num_cols - 1
            kwargs = {
                'family': 'binomial',
                'link': 'logit',
                'y': y,
                'max_iter': 8,
                'n_folds': 0,
                'beta_epsilon': 1e-4,
                'alpha': 0,
                'lambda': 0
            }
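            # alpha=0 and lambda=0 disable the elastic-net penalty entirely,
            # so this is a plain logistic regression capped at max_iter=8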

            for trial in range(3):
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
                elapsed = time.time() - start
                print "glm", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.',
                print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #43
    def test_simple2(self):
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
        csvPathname = find_file("smalldata/logreg/prostate.csv")
        import_result = h2o.n0.import_files(path=csvPathname)
        # print dump_json(import_result)

        k = import_result['keys'][0]
        frames_result = h2o.n0.frames(key=k)

        frame = frames_result['frames'][0]
        rows = frame['rows']
        columns = frame['columns']
        for c in columns:
            label = c['label']
            missing = c['missing_count']
            stype = c['type']
            domain = c['domain']

        # print dump_json(frame)

        # let's see what ray's util does
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
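        # list_to_dict presumably keys the frames list by the nested
        # 'key' -> 'name' field (cf. frame['key']['name'] below)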
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)
        for k, v in frames_dict.items():
            print "frames_dict key:", k

        # interesting. we can do dictionary comprehensions
        # { k:v for k,v in my_dict.items() if 'Peter' in k }
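        # a quick runnable sketch of that pattern against frames_dict
        # ('prostate' is just an example substring, not a required key):
        if 1 == 0:
            matched = {k: v for k, v in frames_dict.items() if 'prostate' in k}
            print "matched frame keys:", matched.keys()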

        # how do you parse multiple files
        parse_result = h2o.n0.parse(
            key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

        frame = parse_result['frames'][0]
        hex_key = frame['key']['name']

        colCount = 9
        rowCount = 380
        # colCount = 11
        # rowCount = 1000000
        start = time.time()
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        numCols = len(inspect['frames'][0]['columns'])
        numRows = inspect['frames'][0]['rows']
        print "\n" + csvPathname, \
            "    rows:", "{:,}".format(numRows), \
            "    len(columns):", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(
            numCols, colCount,
            "parse created result with the wrong number of cols %s %s" %
            (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (numRows, rowCount))

        verboseprint(hex_key, ":", dump_json(parse_result))
Example #44
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if h2o.localhost:
            tryList = [
                (10000, 100, 'cA', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (10000, 100, 'cD', 300), 
                (10000, 200, 'cE', 300), 
                (10000, 300, 'cF', 300), 
                (10000, 400, 'cG', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)


            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, 
                doSummary=False)
            # hack

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']


            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 5 
            prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
            # for max_depth in [5,10,20,40]:
            for max_depth in [5, 10, 20]:

                # PARSE a new header****************************************
                print "Creating new header", hdrPathname
                prefix = prefixList.pop(0)
                write_syn_header(hdrPathname, rowCount, colCount, prefix)

                # upload and parse the header to a hex

                hdr_hex_key = prefix + "_hdr.hex"
                parseHdrResult = h2i.import_parse(bucket=None, path=hdrPathname, schema='put',
                    header=1, # REQUIRED! otherwise will interpret as enums
                    hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False)
                # Set Column Names (before autoframe is created)
                h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key)

                # GBM
                print "response col name is changing each iteration: parsing a new header"
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': prefix + "_response",
                    'ignored_cols_by_name': None,
                }
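                # nbins: histogram bins searched per split; min_rows: minimum
                # observations allowed in a terminal node (standard GBM knobs)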

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

                # works if you delete the autoframe
                ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe')

        # just plot the last one
        if DO_PLOT:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #45
    def test_quantile_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
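            # e.g. for the first row (min 1, max 20000):
            # ((20000 - 1) / 20.0) / 2.0 * 1.05 ~= 525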

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            h2o.beta_features = True
            thresholds = expectedPct  # same threshold list h2o used (see expectedPct above)
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
Example #46
    def test_c10_rel_glm(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        csvFilename = 'classification1Train.txt'
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # keepList = []
        # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
        # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices
        
        # since we're no longer zero based, increment by 1
        x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]

        x = ['C' + str(i + 1) for i in x_from_zero]
        # GLM Train***********************************************************
        y = 'C1'
        keepPattern = None
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "from goodX (not used) x:", x
        print "y:", y

        # have to use named cols, and they start with 1

        kwargs = {
            'x': x,
            'y': 'C1',
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'thresholds': 0.5,
            'n_folds': 1,
            'weight': 100,
            'beta_epsilon': 1.0E-4,
            }
Example #47
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum) = write_syn_dataset(csvPathname,
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)
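                # xor is commutative and associative, so a bitwise xor-sum is
                # order-independent, unlike a floating point sum; that's what
                # makes it usable as a checksum when h2o may reduce rows in a
                # nondeterministic order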

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
                assert parse_key == hex_key
                assert numCols == colCount
                assert numRows == rowCount

                inspect = h2o_cmd.runInspect(key=hex_key)
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert len(missingList) == 0

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
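                # a rough pure-python reference for that bit-level xor-sum
                # (hypothetical helper; assumes h2o_util's double<->ull
                # conversions do the equivalent struct pack/unpack):
                if 1 == 0:
                    import struct
                    def xorsum_bits(values):
                        acc = 0
                        for v in values:
                            acc ^= struct.unpack('<Q', struct.pack('<d', v))[0]
                        return acc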
                for execExpr in exprList:
                    for r in range(10):
        
                        if 1==0:
                            execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                            fpResult = execResult['scalar']
                        else:
                            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300)
                            # print dump_json(h2o.n0.frames(key="h"))

                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="r1"))
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()

                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult != expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                                (ullResult, expectedUllSum))

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
Example #48
    def test_exec2_log_like_R(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        # csvPathname = '1B/reals_100000x1000_15f.data'
        # csvPathname = '1B/reals_1000000x1000_15f.data'
        # csvPathname = '1B/reals_1000000x1_15f.data'
        # csvPathname = '1B/reals_1B_15f.data'
        # csvPathname = '1B/reals_100M_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=3000,
                                       retryDelaySecs=2,
                                       doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        xList = []
        eList = []
        fList = []
        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=300)
        for trial in range(300):
            for execExpr in exprList:
                # put the trial number into the temp for uniqueness
                execExpr = re.sub('Last.value', 'Last.value%s' % trial,
                                  execExpr)
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr,
                                                   resultKey=None,
                                                   timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'
                c = h2o.nodes[0].get_cloud()
                c = c['nodes']

                # print (h2o.dump_json(c))
                k = [i['num_keys'] for i in c]
                v = [i['value_size_bytes'] for i in c]

                print "keys: %s" % " ".join(map(str, k))
                print "value_size_bytes: %s" % " ".join(map(str, v))

                # print "result:", result
                if DO_ORIG:
                    if 'r1' in execExpr:
                        xList.append(trial)
                        eList.append(execTime)
                    if 'log' in execExpr:
                        fList.append(execTime)
                else:
                    xList.append(trial)
                    eList.append(execTime)
                    fList.append(execTime)

        h2o.check_sandbox_for_errors()
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            if DO_ORIG:
                eLabel = 'time: Last.value<trial>.4 = r1[,c(1)]'
                fLabel = 'time: Last.value<trial>.7 = log(Last.value<trial>.6)'
            else:
                eLabel = 'time: Last.value.3 = r2+1'
                fLabel = 'time: Last.value.3 = r2+1'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
Example #49
    def test_rf_libsvm_fvec(self):
        h2o.beta_features = True
        # just do the import folder once

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("gisette_scale.svm", "cF", 30, 1, 0),
            ("covtype.binary.svm", "cC", 30, 1, 1),
            ("mnist_train.svm", "cM", 30, 1, 1),
            # FIX! fails KMeansScore
            # not integer output
            # ("colon-cancer.svm",   "cA", 30, 1, 1),
            ("connect4.svm", "cB", 30, 1, 1),
            # ("syn_6_1000_10.svm",  "cK", 30, 1, 0),   Bad libsvm file has the same column multiple times.
            # float response requires regression
            ("syn_0_100_1000.svm", "cL", 30, 1, 0),
            ("mushrooms.svm", "cG", 30, 1, 1),
            # rf doesn't like reals
            # ("duke.svm",           "cD", 30, 1, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1, 1),
            # too big for rf (memory error)
            # ("news20.svm",         "cH", 30, 1, 1),

            # multiclass format ..don't support
            # ("tmc2007_train.svm",  "cJ", 30, 1, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, hex_key, timeoutSecs, resultMult,
             classification) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            bucket = "home-0xdiag-datasets"
            csvPathname = "libsvm/" + csvFilename

            # PARSE******************************************
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            # RF******************************************
            kwargs = {
                'ntrees': 1,
                'response': 0,
                'classification': classification,
                'importance': 0,
            }
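            # response=0 because parsed libsvm files put the label in column 0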

            timeoutSecs = 600
            start = time.time()
            rf = h2o_cmd.runRF(parseResult=parseResult,
                               timeoutSecs=timeoutSecs,
                               **kwargs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
Example #50
    def test_summary2_int2B(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (100000, 1, 'B.hex', 2533255332, 2633256000, ('C1', None, None,
                                                          None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=60,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')

                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                # expect 21 thresholds, so 20 bins, each ~5% of rows (uniform distribution)
                e = numRows / len(hcnt)
                # apparently we can't estimate any more
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
Example #51
    def test_GLM2grid_covtype_many(self):
        h2o.beta_features = True
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put',
                                       timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "WARNING: max_iter set to 8 for benchmark comparisons"
        max_iter = 8

        y = "54"
        kwargs = {
            'response': y,
            'family': 'gaussian',
            'n_folds': 2,
            'max_iter': max_iter,
            'beta_epsilon': 1e-3,
            'lambda': '0,0.5,0.8',
            'alpha': '0,1e-8,1e-4',
        }
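        # the comma-separated lambda/alpha lists make this a grid job:
        # 3 lambdas x 3 alphas = 9 GLM models per submission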

        start = time.time()
        jobs = []
        totalGLMGridJobs = 0
        for i in range(3):
            glmResult = h2o_cmd.runGLM(parseResult=parseResult,
                                       timeoutSecs=300,
                                       noPoll=True,
                                       **kwargs)

            # print "glmResult:", h2o.dump_json(glmResult)
            # assuming it doesn't complete right away, this is the first response
            # it differs for the last response
            job_key = glmResult['job_key']
            grid_key = glmResult['destination_key']
            jobs.append((job_key, grid_key))
            totalGLMGridJobs += 1

        # do some parse work in parallel. Don't poll for parse completion
        # don't bother checking the parses when they are completed (pollWaitJobs looks at all)
        for i in range(4):
            time.sleep(3)
            hex_key = str(i) + ".hex"
            src_key = str(i) + ".src"
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='put',
                                           src_key=src_key,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           noPoll=True,
                                           doSummary=False)

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start

        # 2/GLMGridView.html?grid_key=asd
        # 2/GLMModelView.html?_modelKey=asd_0&lambda=NaN
        # 2/SaveModel.html?model=GLMGridResults__9a29646b78dd988aacd4f88e4d864ccd_1&path=adfs&force=1
        for job_key, grid_key in jobs:
            gridResult = h2o.nodes[0].glm_grid_view(grid_key=grid_key)
            h2o_glm.simpleCheckGLMGrid(self, gridResult, **kwargs)

        print "All GLMGrid jobs completed in", elapsed, "seconds."
        print "totalGLMGridJobs:", totalGLMGridJobs