def createTestTrain(srcKey, trainDstKey, testDstKey, percent, outputClass, numCols):
    # We'll have to live with a random extract; it will create variance
    print "train: get random %. change class", outputClass, "to 1, everything else to 0. factor() to turn real to int (for rf)"


    # Create complexity on purpose: do the same thing STUPID_REPEAT times in a single exec expression
    execExpr = ""
    STUPID_REPEAT = 20
    for i in range(STUPID_REPEAT):
        execExpr += "a.hex=runif(%s);" % srcKey
        execExpr += "%s=%s[a.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
        if not DO_MULTINOMIAL:
            execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, numCols, trainDstKey, numCols, outputClass)
            execExpr +=  "factor(%s[, %s]);" % (trainDstKey, numCols)

    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=STUPID_REPEAT * 15)

    inspect = h2o_cmd.runInspect(key=trainDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

    print "test: same, but use the same runif() random result, complement"

    execExpr = "a.hex=runif(%s);" % srcKey
    execExpr += "%s=%s[a.hex%s,];" % (testDstKey, srcKey, '>0.9')
    if not DO_MULTINOMIAL:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, numCols, testDstKey, numCols, outputClass)
        execExpr +=  "factor(%s[, %s])" % (testDstKey, numCols)
    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=10)

    inspect = h2o_cmd.runInspect(key=testDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
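
A minimal call sketch for the helper above. The key names, class label, and column index are illustrative assumptions (covtype-style data, where the 1-based output column is 55); note this variant hard-codes the 0.9 split, so percent is effectively unused.

# assumed: 'covtype.hex' is already parsed into H2O; the keys below are placeholders
createTestTrain(srcKey='covtype.hex', trainDstKey='covTrain.hex', testDstKey='covTest.hex',
    percent=90, outputClass=4, numCols=55)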
Example 2
    def test_GLM2_covtype_single_cols(self):
        h2o.beta_features = True
        timeoutSecs = 120
        csvPathname = "standard/covtype.data"
        print "\n" + csvPathname

        # columns start at 0
        y = 54
        ignore_x = ""
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key="A.hex", timeoutSecs=15
        )

        case = 2
        execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        print "GLM binomial ignoring 1 X column at a time"
        print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
        for colX in xrange(1, 53):
            if ignore_x == "":
                ignore_x = "C" + str(colX)
            else:
                # x = x + "," + str(colX)
                ignore_x = "C" + str(colX)

            sys.stdout.write(".")
            sys.stdout.flush()
            print "y:", y

            start = time.time()
            kwargs = {"ignored_cols": ignore_x, "response": y, "n_folds": 6}
            glm = h2o_cmd.runGLM(parseResult={"destination_key": "A.hex"}, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, "took", time.time() - start, "seconds"
Example 3
    def test_exec2_covtype_cols(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key='c.hex', timeoutSecs=30)
        print "\nParse key is:", parseResult['destination_key']

        ### h2b.browseTheCloud()
        start = time.time()
        # passes with suffix, fails without?
        # suffix = ""
        suffix = ".hex"
        for k in range(54):
            # try the funky c(6) thing like  R, instead of just 6
            execExpr = "Result" + str(k) + suffix + " = c.hex[,c(" + str(k+1) + ")]"
            print "execExpr:", execExpr
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result" + str(k) + suffix, 
                timeoutSecs=4)
            for node in h2o.nodes:
                storeView = h2o_cmd.runStoreView(node=node, noPrint=True)
                numKeys = len(storeView['keys'])
                # number of keys should = k + 2? (on each node)
                self.assertEqual(k + 2, numKeys, "# of keys: %s on %s doesn't match expected: %s" % \
                    (numKeys, node, k + 2))
                    # (numKeys, node, k+2, h2o.dump_json(storeView)))

        h2o.check_sandbox_for_errors()
        print "exec end on ", "covtype.data" , 'took', time.time() - start, 'seconds'
Example 4
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, 
    csvSrcOutputPathname, csvPredictPathname, 
    skipSrcOutputHeader, skipPredictHeader,
    translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    h2o.check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if ((rowNum1-skipSrcOutputHeader) != (rowNum2-skipPredictHeader)):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" % \
            (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o)!=str(p):
            if wrong==10:
                print "Not printing any more mismatches\n"
            elif wrong<10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s"  % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong)/len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
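
A hedged usage sketch for the helper above; the model key, frame key, CSV paths, and header flags are placeholder assumptions, and a model is presumed to already exist for hex_key.

# compare a trained model's predictions on 'covtype.hex' against its output column (y=54, 0-based)
pctWrong = predict_and_compare_csvs(model_key='GLMModel_example', hex_key='covtype.hex',
    predictHexKey='predict.hex',
    csvSrcOutputPathname='./sandbox/original_output.csv',
    csvPredictPathname='./sandbox/predict_output.csv',
    skipSrcOutputHeader=0, skipPredictHeader=1, translate=None, y=54)
print "pctWrong:", pctWrong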
Example 5
    def test_exec2_operators(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        csvPathname = 'standard/covtype.data'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        # h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
            print h2o.dump_json(resultExec)
            print 'exec end took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key='a.hex')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            h2o.check_sandbox_for_errors()
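
initList and exprList are module-level lists in the original test file and aren't shown in this snippet. A minimal sketch of the shapes the loops above expect; the expressions and the expected row count are illustrative assumptions.

# (resultKey, execExpr) setup pairs; resultKey is ignored here since resultKey=None is passed
initList = [
    ('r.hex', 'r.hex=i.hex'),
]
# (execExpr, expectedRows) pairs: each expression should leave a one-column a.hex
# whose row count is asserted after the exec
exprList = [
    ('a.hex=c(1)', 1),
    ('a.hex=i.hex[,1]', 581012),  # covtype.data row count (assumed)
]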
Example 6
    def test_exec2_reduction(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if getpass.getuser()=='jenkins':
            csvPathname = 'standard/billion_rows.csv.gz'
        else:
            csvPathname = '1B/reals_100000x1000_15f.data'
            csvPathname = '1B/reals_1B_15f.data'
            csvPathname = '1B/reals_1000000x1000_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            print "result:", result

        for execExpr in exprList:
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
            print 'exec took', time.time() - start, 'seconds'
            print "result:", result
            assert result==1

        h2o.check_sandbox_for_errors()
Example 7
    def test_exec2_na_chop(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        inspect = h2o_cmd.runInspect(key='i.hex')
        print "\ni.hex" \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows1 = inspect['numRows']
        numCols = inspect['numCols']

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, keyX='s.hex', maxTrials=200, timeoutSecs=30, maxCol=numCols-1)

        inspect = h2o_cmd.runInspect(key='s.hex')
        print "\ns.hex" \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        numRows2 = inspect['numRows']

        print numRows1, numRows2


        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Example 8
    def test_exec2_result_race(self):
        ### h2b.browseTheCloud()

        lenNodes = len(h2o.nodes)
        # zero the list of Results using node[0]
        # FIX! is the zerolist not being seen correctly? is it not initializing to non-zero?
        for execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=20)
            ### print "\nexecResult:", execResult

        trial = 0
        while (trial < 200):
            for execExpr in exprList:
                # for the first 100 trials: do each expression at node 0,
                # for the second 100 trials: do each expression at a random node, to facilitate key movement
                # FIX! there's some problem with the initList not taking if rotated amongst nodes?
                if (trial < 100):
                    nodeX = 0
                else:
                    nodeX = random.randint(0,lenNodes-1)
                
                resultKey = "Result.hex"
                execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                    resultKey=resultKey, timeoutSecs=20)

                print min_value, execExpr
                h2o.verboseprint("min_value: ", min_value, "trial:", trial)

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                trial += 1
Example 9
    def test_rapids_funs_basic2(self):
        if 1==1:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'
        else:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for trial in range(5):
            for execExpr in funsList:
                funs = '[%s]' % execExpr
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, 
                    timeoutSecs=4)
                execExpr2 = '(= !junk (apply %r1 #2 %anon))' 
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, 
                    timeoutSecs=15)
                # rows might be zero!
                if execResult['num_rows'] or execResult['num_cols']:
                    keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
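
funsList is defined elsewhere in the original test module. A sketch of the shape the loop above expects: each entry is a Rapids function definition that gets wrapped in '[...]' and registered with doFuns=True, then applied to %r1 via (apply %r1 #2 %anon). The body shown is an illustrative guess, not the suite's actual list.

funsList = [
    '(def anon {v} (+ %v #1);;;)',  # assumed example: the function must be named anon and take one argument
]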
Example 10
    def test_GLM2_params_rand2(self):
        csvPathname = 'covtype/covtype.20k.data'

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k")

        CLASS = 1
        # make a binomial version 
        execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        paramDict = define_params()
        for trial in range(20):
            # params is mutable. This is default.
            params = {
                'response': 54, 
                'alpha': 0.1, 
                # 'lambda': 1e-4, 
                'lambda': 0,
                'n_folds': 1,
            }
            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            if 'family' not in kwargs or kwargs['family']=='binomial':
                bHack = {'destination_key': 'B.hex'}
            else:
                bHack = parseResult
            
            start = time.time()
            glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs)
            # pass the kwargs with all the params, so we know what we asked for!
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Example 11
    def test_exec2_fast_locks(self):
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
            
        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example 12
    def test_exec2_operators4(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        bigExecExpr = ""
        expCnt = 0

        for t in range(200):
            execExpr = random.choice(exprList)
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
            expCnt += 1
            # limit to 2 expressions. 
            # Also: functions must be solitary
            # Also: ifelse() must be solitary
            # Also: ternary operators must be solitary
            if expCnt > 2 or 'function' in execExpr or 'ifelse' in execExpr or "?" in execExpr:
                bigExecExpr = ""
                expCnt = 0
                

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Example 13
    def test_exec2_poppush_fail(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        exprList = []
        while (len(exprList)!=20):
            exprs = [random.choice(phrases) for j in range(random.randint(1,2))]
            # check if we have mean2() before function defn
            functionFound = False
            for e in exprs:
                if 'function' in e:
                    functionFound = True
                if 'mean2' in e and not functionFound:
                    # add the function definition first
                    exprs = ["mean2=function(x){apply(x,1,sum)/nrow(x)};"] + exprs
            exprList.append("".join(exprs))

        # add this one for good measure (known fail)
        exprList += ["r.hex-r.hex; mean2=function(x){apply(x,1,sum)/nrow(x)}; mean2(r.hex); r.hex[,ncol(r.hex)+1]=4;"]

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
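
phrases and initList come from the enclosing test module and aren't shown. A sketch of plausible shapes, assuming initList creates r.hex; the phrase strings are lifted from the known-fail expression above.

initList = [
    ('r.hex', 'r.hex=i.hex'),
]
# exec2 phrases that get randomly concatenated; when 'mean2' appears before its
# definition, the loop above prepends the function definition
phrases = [
    "r.hex-r.hex;",
    "mean2(r.hex);",
    "r.hex[,ncol(r.hex)+1]=4;",
]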
Example 14
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for i in range(100):
            if i==0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but not required cause function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v2 (+ %v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15)


            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1==0:
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreView:", h2o.dump_json(storeView)
                    if lhs not in storeView['keys']:
                        raise Exception("Expected to find %s in %s" % (lhs, storeView['keys']))
            else: 
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example 15
    def test_GLM2_model_key_unique(self):
        h2o.beta_features = True
        modelKeyDict = {}
        for trial in range (1,5):
            csvPathname = 'iris/iris2.csv'
            start = time.time()
            # make sure each parse is a unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=10)
            y = 4
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            
            # h2o.py now sets destination_key for a fixed default model name, 
            # we want h2o to create model names for this test, so use none here
            kwargs = {'destination_key': None, 'response':4, 'family': 'gaussian'}
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, noPoll=True, **kwargs )
            print "GLM #%d" % trial,  "started on ", csvPathname, 'took', time.time() - start, 'seconds'

            model_key = glmResult['destination_key']
            print "GLM model_key:", model_key
            if model_key in modelKeyDict:
                raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" % (trial, modelKeyDict[model_key]))
            modelKeyDict[model_key] = trial

        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example 16
    def test_rapids_vec_fail1(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6),int(100e6),int(10e6)):
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            execExpr = '(= !v (+ %v %v))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30)
            elapsed2 = time.time() - start

            if execResult['num_rows']:
                keys.append(execExpr)
            
            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)


        if 1==1:
            xLabel = 'vector length'
            eLabel = 'elapsed (create v)'
            fLabel = 'elapsed (v = v + v)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Example 17
def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent, 
    outputClass=None, outputCol=None, changeToBinomial=False):
    # will have to live with random extract. will create variance

    print "train: get random", trainPercent
    print "test: get remaining", 100 - trainPercent
    if changeToBinomial:
        print "change class", outputClass, "to 1, everything else to 0. factor() to turn real to int (for rf)"

    boundary = (trainPercent + 0.0)/100

    execExpr = ""
    execExpr += "cct.hex=runif(%s,-1);" % srcKey
    execExpr += "%s=%s[cct.hex<=%s,];" % (trainDstKey, srcKey, boundary)
    if changeToBinomial:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, outputCol+1, trainDstKey, outputCol+1, outputClass)
        execExpr +=  "factor(%s[, %s]);" % (trainDstKey, outputCol+1)

    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=30)

    inspect = runInspect(key=trainDstKey)
    infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

    print "test: same, but use the same runif() random result, complement comparison"

    execExpr = ""
    execExpr += "%s=%s[cct.hex>%s,];" % (testDstKey, srcKey, boundary)
    if changeToBinomial:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, outputCol+1, testDstKey, outputCol+1, outputClass)
        execExpr +=  "factor(%s[, %s])" % (testDstKey, outputCol+1)
    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=30)

    inspect = runInspect(key=testDstKey)
    infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
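
A call sketch for this variant; the key names are illustrative assumptions. Unlike the first createTestTrain above, this one honors trainPercent via the boundary, and outputCol is 0-based (the +1 happens inside).

createTestTrain(srcKey='covtype.hex', trainDstKey='covTrain.hex', testDstKey='covTest.hex',
    trainPercent=90, outputClass=4, outputCol=54, changeToBinomial=True)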
Example 18
    def test_GLM2_covtype_single_cols(self):
        timeoutSecs = 120
        csvPathname = 'standard/covtype.data'
        print "\n" + csvPathname

        # columns start at 0
        y = 54
        ignore_x = ""
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', 
            hex_key='A.hex', timeoutSecs=15)

        case = 2
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        print "GLM binomial ignoring 1 X column at a time" 
        print "Result check: abs. value of coefficient and intercept returned are bigger than zero"
        for colX in xrange(1,53):
            if ignore_x == "": 
                ignore_x = 'C' + str(colX)
            else:
                # x = x + "," + str(colX)
                ignore_x = 'C' + str(colX)

            sys.stdout.write('.')
            sys.stdout.flush() 
            print "y:", y

            start = time.time()
            kwargs = {'ignored_cols': ignore_x, 'response': y, 'n_folds': 6 }
            glm = h2o_cmd.runGLM(parseResult={'destination_key': 'A.hex'}, timeoutSecs=timeoutSecs, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
Example 19
def execit(n, bucket, path, src_key, hex_key, timeoutSecs=60, retryDelaySecs=1, pollTimeoutSecs=30):
    np1 = (n+1) % len(h2o.nodes)
    np = (n) % len(h2o.nodes)
    # doesn't work cause we can't have racing writers
    # execExpr = "r2 = (r2==%s) ? %s+1 : %s" % (np1, np1)
    if np == 0:
        if READ_ONLY:
            execExpr = "(r%s==1) ? c(1) : c(0);" % np
        else:
            execExpr = "r%s = c(1)" % np1
        print "Sending request to node: %s" % h2o.nodes[np1],
        h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
    else:
        # flip to one if the prior value is 1 (unless you're the zero case)
        if READ_ONLY:
            execExpr = "(r%s==1) ? c(1) : c(0);" % np
        else:
            execExpr = "r%s = (r%s==1) ? c(1) : c(0);" % (np1, np)
        print "Sending request to node: %s" % h2o.nodes[np1],
        (resultExec, fpResult) = h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)
        while fpResult != 1:
            print "to node: %s" % h2o.nodes[np1]
            (resultExec, fpResult) = h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)

    hex_key = np1
    return hex_key
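
In the multi-node tests further down, execit is handed to worker processes via function_no_keyboard_intr; a direct call looks roughly like this. READ_ONLY is a module-level flag, the argument values are illustrative, and bucket/path/src_key are unused by execit itself (kept only for the common worker signature).

hex_key = execit(0, bucket=None, path=None, src_key=None, hex_key='a', timeoutSecs=60)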
Example 20
    def test_exec2_ddply_phrases(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'standard/covtype.data'
        csvPathname = "standard/covtype.shuffled.10pct.data"

        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)


        for col in range(1,10):
            initList = [
                ('r.hex', 'r.hex=i.hex'),
                (None, "func1=function(x){max(x[,%s])}" % col),
                (None, "func2=function(x){a=3;nrow(x[,%s])*a}" % col),
                (None, "func3=function(x){apply(x[,%s],2,sum)/nrow(x[,%s])}" % (col, col) ),
                # (None, "function(x) { cbind( mean(x[,1]), mean(x[,%s]) ) }" % col),
                (None, "func4=function(x) { mean( x[,%s]) }" % col), 
                (None, "func5=function(x) { sd( x[,%s]) }" % col), 
                # (None, "func6=function(x) { quantile(x[,%s] , c(0.9) ) }" % col),
            ]
            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

            for p in phrases:
                execExpr = "ddply(r.hex, c(2), " + p + ")" 
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
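
phrases here is a module-level list naming the helper functions defined in initList (func1 through func5; func6 is commented out); a sketch of what the ddply loop above expects.

# each entry is substituted as the FUN argument in ddply(r.hex, c(2), FUN)
phrases = [
    'func1',
    'func2',
    'func3',
    'func4',
    'func5',
]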
Example 21
    def test_exec2_operators2(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        start = time.time()
        h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)

        # now run them just concatenating each time. We don't do any template substitutes, so don't need
        # exec_expr_list_rand()
        
        bigExecExpr = ""
        expCnt = 0
        for execExpr in exprList:
            bigExecExpr += execExpr + ";"
            h2e.exec_expr(h2o.nodes[0], bigExecExpr, resultKey=None, timeoutSecs=4)
            expCnt += 1
            # limit the concatenation to 3 expressions and see what happens
            if expCnt > 2:
                bigExecExpr = ""
                expCnt = 0
                

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
Example 22
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000
        bucket = "home-0xdiag-datasets"
        importFolderPath = "manyfiles-nflx-gz"
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            # ("*[1][0][0].dat", "file_1_A.dat", 1 * avgMichalSize, 1800),
            ("*[1][0-4][0-9].dat.gz", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern
            hex_key = csvFilename + ".hex"

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            parseResult = h2i.import_parse(
                bucket=bucket, path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=600
            )
            execExpr = "A.hex=%s" % parseResult["destination_key"]
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

            h2o_cmd.runStoreView(timeoutSecs=60)
Example 23
    def test_exec2_poppush2_fail(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        exprList = []
        while (len(exprList)!=20):
            exprs = [random.choice(phrases) for j in range(random.randint(1,2))]
            # check if we have mean2() before function defn
            functionFound = False
            for i, e in enumerate(exprs):
                if 'function' in e:
                    functionFound = True
                    # h2o has problems with assigns after functions
                
            if functionFound and len(exprs)> 1:
                # pass
                exprList.append("".join(exprs))
            else:
                exprList.append("".join(exprs))


        # add this one for good measure (known fail)
        # exprList += "crunk=function(x){x+98};r.hex[,3]=4;"
        exprList += ["function(x){x+98};r.hex[,3]=4;"]

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)

        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
Example 24
    def test_exec2_multi_node(self):
        h2o.beta_features = True
        for n, node in enumerate(h2o.nodes):
            print "n:", n
            np1 = (n+1) % len(h2o.nodes)
            np = n % len(h2o.nodes)

            # get this key known to this node
            print "Init with independent targets. No shared target"
            execExpr = "r%s = c(0)" % np1
            print "Sending request to node: %s" % h2o.nodes[np1]
            h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)

            # test the store expression
            execExpr = "(r%s==0)" % np1
            print "Sending request to node: %s" % h2o.nodes[np1]
            h2e.exec_expr(node=h2o.nodes[np1], execExpr=execExpr, timeoutSecs=30)

        global OUTSTANDING
        if not OUTSTANDING:
            OUTSTANDING = min(10, len(h2o.nodes))

        execTrial = 0
        worker_resultq = multiprocessing.Queue()
        while execTrial <= TRIALMAX:
            start = time.time()
            workers = []
            for o in range(OUTSTANDING):
                np = execTrial % len(h2o.nodes)
                retryDelaySecs = 5
                timeoutSecs = 60
                bucket = None
                csvPathname = None
                src_key = None
                hex_key = 'a'
                tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                    args=(worker_resultq, execit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
                tmp.start()
                workers.append(tmp)
                execTrial += 1

            # Exec doesn't get tracked as a job, so we can still have outstanding requests
            # now sync on them
            for worker in workers:
                try:
                    # this should synchronize
                    worker.join()
                    print "worker joined:", worker
                    # don't need him any more
                    worker.terminate()
                    hex_key = worker_resultq.get(timeout=2)
                except KeyboardInterrupt:
                    print 'parent received ctrl-c'
                    for worker in workers:
                        worker.terminate()
                        worker.join()
            elapsed = time.time() - start
            print "Group end at #", execTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example 25
        def doAll(case):
            keys = []
            trial = 0
            for execExpr in exprList:
                # 4x4 cases per expression
                colons = [
                    # requires only 1 value on rhs
                    '#0 #0',
                    # '"null" #0',
                    # '#0 "null"',

                    # '"null" "null"',
                ]
                for colon in colons:
                    # what if the destination doesn't exist? Use a unique name for each, to see
                    t = "t%s" % trial
                    cases = [
                        # no colon 
                        '(= !{} {})'.format(t, execExpr),
                        # colon lhs
                        # '(= ([ %%s %s) %s)' % (t, colon, execExpr),
                        # colon rhs
                        # '(= !%s  ([ %s %s))' % (t, execExpr, colon),
                        # colon lhs and rhs
                        '(= ([ %{} {}) ([ {} {}))'.format(t, colon, execExpr, colon),
                    ]

                    for case in cases:
                        # init the data frame first to 0 (1 row, 1 col) 
                        print "\nt:", t, "case:", case
                        # can't init it to empty
                        '(= !%s (c {#0})' % t
                        execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4)

                        # colonize it, to see if it blows up!
                        # since they all are assigns, they all are wrapped by '(= !<lhs> ...)
                        # unwrap the inner and wrap it with a colon then wrap it with the assign
                        # change the lhs to be coloned (row and/or col) and change the rhs to be a colon
                        # so four cases
                        # make sure the lhs assign key exists first
                        execResult, result = h2e.exec_expr(h2o.nodes[0], case, resultKey=None, timeoutSecs=4)
                        # rows/cols could be zero
                        # if execResult['num_rows'] or execResult['num_cols']:
                        # I think if key is not null, then that means a key got created
                        # oh, but exec deletes ones with leading "_" immediately? those are temp keys
                        # we'll put them in the list and see if we see them
                        if execResult['key']:
                            keys.append(execExpr)
                        trial += 1


                print "\nExpressions that created keys"
                for k in keys:
                    print k
                    if re.match('_', k):
                        raise Exception(("%s I didn't expect any keys with leading underscores."
                            "\nDoesn't spencer delete those so I can't read them?") % k)

                h2o.check_sandbox_for_errors()
Example 26
    def test_parse_cust(self):
        # run as user 0xcustomer to get access (with .json config and ssh key file specified)
        importFolderPath = '/mnt/0xcustomer-datasets'
        pollTimeoutSecs = 120
        retryDelaySecs = 30
        timeoutSecs = 300
        
        (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
        importFileList = importResult['files']
        importFailList = importResult['fails']
        importKeyList = importResult['keys']
        importDelList = importResult['dels']

        if len(importDelList)!=0:
            raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))

        if len(importFileList)<MINFILES:
            raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))

        if len(importKeyList)<MINFILES:
            raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))

        if len(importFailList)!=0:
            raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))


        # only parse files with .csv or .tsv in their name (no dirs like that?)
        goodKeyList = [key for key in importKeyList if ('.csv' in key  or '.tsv' in key)]
        trial = 0
        # just do 1?
        for i, importKey in enumerate(random.sample(goodKeyList,3)):
            print "importKey:", importKey
            trial +=1

            start = time.time() 
            # some data has ,, in the header row; can't have multiple NAs, and h2o doesn't like that
            # force header=0, which should mean headers get treated as NAs
            parseResult = h2i.parse_only(pattern=importKey, header=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            origKey = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=origKey)
            h2o_cmd.infoFromInspect(inspect, origKey)

            execExpr = 'newKey = '+origKey+'[1,1]'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            newParseKey = {'destination_key': 'newKey'}

            h2o_cmd.checkKeyDistribution()
            h2o.nodes[0].remove_key(key=origKey)
            # a key isn't created for a scalar
            # h2o.nodes[0].remove_key(key='newKey')
        
        self.assertGreater(trial, MINDONE-1, msg="There should be more than %s parsed files" % MINDONE)
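
MINFILES and MINDONE are module-level thresholds in the original test and aren't shown here. Plausible definitions consistent with the checks above; the MINFILES value is an assumption, while MINDONE matches the random.sample(goodKeyList, 3) loop.

MINFILES = 20   # the import must report at least this many files and keys
MINDONE = 3     # at least this many parses must complete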
Example 27
    def test_B_claim_prediction_binomial(self):
        csvPathname = 'allstate/claim_prediction_train_set_10000_int.csv.gz'
        kwargs = {'family': 'binomial', 'response': 'Claim_Amount', 'alpha': 0, 'lambda': 0.5, 'max_iter': 15}
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key='A.hex')
        execExpr = 'A.hex[,35] = A.hex[,35]>100'
        h2o_exec.exec_expr(execExpr=execExpr)
        parseResult['destination_key'] = 'A.hex'
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=150, **kwargs)
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example 28
    def test_exec2_cbind_fail2(self):

        for i in range(5):
            execExpr = "a=c(0,0,0); b=c(0,0,0)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "h <- cbind(a, b)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        h2o.check_sandbox_for_errors()
Example 29
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # PARSE****************************************
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA, hex_key=hex_key,
                delete_on_done=DELETE_ON_DONE, 
                # importParentDir=IMPORT_PARENT_DIR,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y; just check with the first one
            # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # convert to binomial
            if DO_EXEC:
                execExpr="A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

                # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

            if DO_DELETE_MYSELF:
                h2o_import.delete_keys_at_all_nodes()

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Example 30
    def test_exec2_multi_node3(self):

        for initTrial in range(1):
            for node in h2o.nodes:
                # get this key known to this node
                execExpr = "r0 = c(0,0); r1 = c(0,0); r2 = c(0,0);"
                print "Sending request to node: %s" % node
                h2e.exec_expr(node=node, execExpr=execExpr, timeoutSecs=30)

                if TEST_MUX_STORE:
                    # test the store expression
                    execExpr = "(r1==c(0,0)) ? c(0,0) : c(1,1)"
                    print "Sending request to node: %s" % node
                    h2e.exec_expr(node=node, execExpr=execExpr, timeoutSecs=30)

        global OUTSTANDING
        if not OUTSTANDING:
            OUTSTANDING = min(10, len(h2o.nodes))

        execTrial = 0
        worker_resultq = multiprocessing.Queue()
        while execTrial <= TRIALMAX:
            start = time.time()
            workers = []
            for o in range(OUTSTANDING):
                np = execTrial % len(h2o.nodes)
                retryDelaySecs = 5
                timeoutSecs = 60
                bucket = None
                csvPathname = None
                src_key = None
                hex_key = 'a'
                tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                    args=(worker_resultq, execit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
                tmp.start()
                workers.append(tmp)
                execTrial += 1

            # Exec doesn't get tracked as a job, so we can still have outstanding requests
            # now sync on them
            for worker in workers:
                try:
                    # this should synchronize
                    worker.join()
                    print "worker joined:", worker
                    # don't need him any more
                    worker.terminate()
                    hex_key = worker_resultq.get(timeout=2)
                except KeyboardInterrupt:
                    print 'parent received ctrl-c'
                    for worker in workers:
                        worker.terminate()
                        worker.join()
            elapsed = time.time() - start
            print "Group end at #", execTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Example 31
    def test_GLM2_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # H2O might not do whitespace stripping on numbers correctly when , is {SEP}
        # GLM will auto-expand categoricals, so if we have more coefficients than expected,
        # that means it didn't parse right
        # mix in space/tab combos
        # just done like this for readability
        rowDataTrueRaw = \
            "<sp>1,\
            0<sp>,\
            <tab>65,\
            1<tab>,\
            <sp><tab>2,\
            1<sp><tab>,\
            <tab><sp>1,\
            4<tab><sp>,\
            <tab><tab>1,\
            4<tab><tab>,\
            <sp><sp>1,\
            4<sp><sp>"

        rowDataTrue = re.sub("<sp>"," ", rowDataTrueRaw)
        rowDataTrue = re.sub("<tab>","  ", rowDataTrue)

        rowDataFalse = \
            "0,\
            1,\
            0,\
            -1,\
            -2,\
            -1,\
            -1,\
            -4,\
            -1,\
            -4,\
            -1,\
            -3"

        twoValueList = [
            # (0,1,0, 12),
            # (0,1,1, 12),
            # ('A','B',0, 12),
            # ('A','B',1, 12),
            (-1,1,-1, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, expectedCoeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            hex_key = csvFilename + "_" + str(trial)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)

            # maybe go back to simpler exec here. this was from when Exec failed unless this was used
            execExpr="A.hex=%s" % hex_key
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (13, 13, case)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            aHack = {'destination_key': 'A.hex'}

            start = time.time()
            kwargs = {
                'n_folds': 0,
                'response': 'C13', 
                'family': 'binomial', 
                'alpha': 0.0, 
                'lambda': 0, 
                'beta_epsilon': 0.0002
            }

            # default takes 39 iterations? play with alpha/beta
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)
            glm = h2o_cmd.runGLM(parseResult=aHack, **kwargs)
            (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # check that the number of entries in coefficients is right (12, not counting the intercept)

            coefficients_names = glm['glm_model']['coefficients_names']
            print "coefficients_names:", coefficients_names

            # subtract one for intercept
            actualCoeffNum = len(glm['glm_model']['submodels'][0]['beta']) - 1
            if (actualCoeffNum!=expectedCoeffNum):
                raise Exception("Should be %s expected coefficients in result. actual: %s" % (expectedCoeffNum, actualCoeffNum))

            print "trial #", trial, "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            trial += 1
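
write_syn_dataset is defined in the original test module and not shown. A minimal sketch consistent with how it is called above: it writes rowCount 'true' rows and rowCount 'false' rows, appending the output value as the 13th column; the exact layout is an assumption.

def write_syn_dataset(csvPathname, rowCount, rowDataTrue, rowDataFalse, outputTrue, outputFalse):
    dsf = open(csvPathname, 'w')
    for i in range(rowCount):
        dsf.write(rowDataTrue + "," + outputTrue + "\n")
        dsf.write(rowDataFalse + "," + outputFalse + "\n")
    dsf.close()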
Example 32
def predict_and_compare_csvs(model_key,
                             hex_key,
                             predictHexKey,
                             csvSrcOutputPathname,
                             csvPredictPathname,
                             skipSrcOutputHeader,
                             skipPredictHeader,
                             translate=None,
                             y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey + "=" + hex_key,
                  timeoutSecs=30)  # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) + "]",
                  timeoutSecs=30)

    start = time.time()
    predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                data_key=hex_key,
                                                destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o.nodes[0].csv_download(src_key="Z.hex",
                              csvPathname=csvSrcOutputPathname)
    h2o.nodes[0].csv_download(src_key=predictHexKey,
                              csvPathname=csvPredictPathname)
    h2o.check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1,
     originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
                                              msg="Original",
                                              colIndex=0,
                                              translate=translate,
                                              skipHeader=skipSrcOutputHeader)
    (rowNum2,
     predictOutput) = compare_csv_at_one_col(csvPredictPathname,
                                             msg="Predicted",
                                             colIndex=0,
                                             skipHeader=skipPredictHeader)

    # no header on source
    if ((rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader)):
        raise Exception(
            "original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" %
            (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o) != str(p):
            if wrong == 10:
                print "Not printing any more mismatches\n"
            elif wrong < 10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s" % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong) / len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
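
# Hedged sketch, not part of the original test: compare_csv_at_one_col() is called above but not
# defined in this snippet. A minimal stand-in is assumed to look roughly like this: read one
# column out of a csv, optionally skip header rows, and optionally map values through a
# 'translate' dict before comparing. Names and return shape here are assumptions.
import csv

def compare_csv_at_one_col_sketch(csvPathname, msg, colIndex=0, translate=None, skipHeader=0):
    # returns (rowCountIncludingHeader, listOfValuesInThatColumn), matching how it's used above
    output = []
    with open(csvPathname, 'rb') as f:
        reader = csv.reader(f)
        for rowNum, row in enumerate(reader):
            if rowNum < skipHeader:
                continue
            value = row[colIndex]
            if translate:
                value = translate.get(value, value)
            output.append(value)
    print msg, "read", len(output), "values from col", colIndex, "of", csvPathname
    return (len(output) + skipHeader, output)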
Example #33
    def test_exec2_xorsum2(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(3):
            ullResultList = []
            NUM_FORMAT_CASES = h2o_util.fp_format()
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname

                sel = random.randint(0, NUM_FORMAT_CASES-1)
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE, sel)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                
                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for repeat in range(3):
                        start = time.time()
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, 
                            resultKey=None, timeoutSecs=300)
                        print 'exec took', time.time() - start, 'seconds'
                        print "execResult:", h2o.dump_json(execResult)
                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way. needed when integers are parsed

                        # okay for a couple of lsbs to be wrong, due to conversion from string
                        # ullResult (0.16x): 0x02c1a21f923cee96   2.15698793923e-295
                        # expectedUllSum (0.16x): 0x02c1a21f923cee97   2.15698793923e-295
                        # expectedFpSum (0.16x): 0x42f054af32b3c408   2.87294442126e+14

                        # ullResult and expectedUllSum are Q ints, (64-bit) so can subtract them.
                        # I guess we don't even care about sign, since we zero the first 4 bits (xorsum) to avoid nan/inf issues
                        ALLOWED_BIT_ERR = 0x1f # seeing this amount of error!
                        if ullResult!=expectedUllSum and (abs(ullResult-expectedUllSum)>ALLOWED_BIT_ERR):
                            emsg = "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)
                            if STOP_ON_ERROR:
                                raise Exception(emsg)
                            else:  
                                print emsg

                        # print "%30s" % "hex(bitResult):", hex(ullResult)

                    h2o.check_sandbox_for_errors()

                    print "first result was from a sum. others are xorsum"
                    print "ullResultList:"
                    for ullResult, fpResult in ullResultList:
                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                    print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                    print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
Example #34
    def test_exec2_quantile_na_scalar(self):
        h2o.beta_features = True
        for execExpr in initList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=180)

        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=180)
            print 'exec end took', time.time() - start, 'seconds'
            h2p.blue_print("h2o exec quantiles result:", result)
            self.assertEqual(
                result,
                expectedP,
                msg="Checking exec quantiles median, expectedP: %s result: %s"
                % (expectedP, result))
            print h2o.dump_json(resultExec)
            # do the quantiles page on the created key
            kwargs = {
                'column': 0,
                'quantile': QUANTILE,
                'multiple_pass': 2,
                'max_qbins': 1000,
            }
            q = h2o.nodes[0].quantiles(source_key='ddd', **kwargs)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertEqual(qresult_iterations,
                             3,
                             msg="should take 3 iterations")

            # self.assertEqual(qresult_interpolated, True, msg="Should say it's interpolating")

            self.assertEqual(
                qresult,
                expectedP,
                msg="Checking quantilespage median, expectedP: %s result: %s" %
                (expectedP, qresult))

            inspect = h2o_cmd.runInspect(key='abc')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            h2o.check_sandbox_for_errors()
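
# Hedged sketch, not part of the original test: initList/exprList/expectedP/QUANTILE come from
# module scope and aren't shown here. This is only a pure-python reference for the kind of
# interpolated quantile the exec and Quantiles checks above are compared against (NAs dropped,
# linear interpolation between neighbors). H2O's multipass binned algorithm is different inside.
def quantile_sketch(values, q):
    xs = sorted(v for v in values if v is not None)   # drop NAs first
    if not xs:
        return None
    pos = q * (len(xs) - 1)                           # fractional index into sorted data
    lo = int(pos)
    hi = min(lo + 1, len(xs) - 1)
    frac = pos - lo
    return xs[lo] + frac * (xs[hi] - xs[lo])          # linear interpolation

print quantile_sketch([3.0, None, 1.0, 2.0, 4.0], 0.5)   # 2.5 (median of 1,2,3,4)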
Example #35
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000
        avgMichalSize = 116561140
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        if 1 == 0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_1.dat", "file_1.dat",
                 1 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat",
                 10 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat",
                 20 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat",
                 50 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("onefile-nflx/file_1_to_100.dat", "file_single.dat",
                 100 * avgMichalSizeUncompressed, 1200),
            ]
        if 1 == 1:
            importFolderPath = '/home/0xdiag/datasets'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz",
                 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz",
                 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz",
                 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz",
                 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_100.dat.gz",
                 100 * avgMichalSize, 1200),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 10
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        noPoll = False
        benchmarkLogging = ['cpu', 'disk', 'iostats', 'jstack']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        # enumerate so 'i' is defined for the look-ahead parses in the noPoll branch below
        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2,
                                java_heap_GB=tryHeap,
                                base_port=base_port,
                                enable_benchmark_log=True)
            else:
                h2o_hosts.build_cloud_with_hosts(1,
                                                 java_heap_GB=tryHeap,
                                                 base_port=base_port,
                                                 enable_benchmark_log=True)
            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)
                importFullList = importFolderResult['succeeded']
                importFailList = importFolderResult['failed']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                    importFailList)
                # creates csvFilename.hex from file in importFolder dir

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilepattern,
                    importFolderPath,
                    key2=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        s3nKey = URI + "/" + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse2Key = h2o.nodes[0].parse(
                            s3nKey,
                            key2,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i + 2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        s3nKey = URI + "/" + csvFilepattern
                        key2 = csvFilename + "_" + str(trial) + ".hex"
                        print "Loading", protocol, "key:", s3nKey, "to", key2
                        parse3Key = h2o.nodes[0].parse(
                            s3nKey,
                            key2,
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response'][
                    'time']
                print "Parse result['destination_key']:", parseKey[
                    'destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.check_enums_from_inspect(parseKey)

                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                h2o_cmd.check_key_distribution()
                h2o_cmd.delete_csv_key(csvFilename, importFullList)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
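
# Hedged sketch, not part of the original test: the comment above mentions os.path.getsize()
# as a way to get totalBytes right for a file pattern instead of hardcoding avgMichalSize
# multiples. A minimal version of that, plus the same MB/sec math used for the benchmark log line.
import glob
import os

def total_bytes_for_pattern_sketch(importFolderPath, csvFilepattern):
    paths = glob.glob(os.path.join(importFolderPath, csvFilepattern))
    return sum(os.path.getsize(p) for p in paths)

def file_mbs_sketch(totalBytes, elapsed):
    # same units as the '{:6.2f} MB/sec' message above
    return (totalBytes / 1e6) / elapsed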
Example #36
    def test_rapids_cbind_vec(self):

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 10
        # for trial in range(maxx):
        # for trial in range(int(1e6),int(200e6),int(1e6)):
        for trial in [int(10e6)]:
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            # cols = 100
            xList = []
            eList = []
            fList = []
            for trial2 in range(0, 5):
            # for trial2 in range(0, 10):
            # fails. Post size?
            # for trial2 in range(0, 16):
                col = 2 ** trial2
                # assert col < 16384, "h2o can't take col == 16384 or more"
             
                vString = ' '.join(['%v' for x in range(col)])
                execExpr = '(= !v2 (cbind %s))' % vString

                # FIX! check the colnames. 2 cols get C1 and C10? odd 
                # try:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
                elapsed2 = time.time() - start

                if execResult['num_rows']:
                    keys.append(execExpr)
                
                # except:
                #     elapsed2 = 0
                #     h2p.red_print("ERROR: col = %s failed" % col)

                if 1==0:
                    start = time.time()
                    execExpr = '(sum %v2 %TRUE)'
                    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                    elapsed1 = time.time() - start

                # xList.append(length)
                xList.append(col)
                eList.append(elapsed1)
                fList.append(elapsed2)


        if 1==1:
            xLabel = 'col'
            eLabel = 'elapsed (sum)'
            fLabel = 'elapsed (cbind cols)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
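
# Hedged sketch, not part of the original test: the loop above wonders whether wide cbinds fail
# because of POST size. This only shows how fast the rapids expression string grows as the column
# count doubles; the actual request-size limit is not known here and is left as an assumption.
def cbind_expr_sizes_sketch(maxPower=16):
    for trial2 in range(maxPower):
        col = 2 ** trial2
        vString = ' '.join(['%v' for _ in range(col)])
        execExpr = '(= !v2 (cbind %s))' % vString
        print "col %6d -> expression length %8d bytes" % (col, len(execExpr))

cbind_expr_sizes_sketch()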
Example #37
    def test_rf_covtype_train_oobe3(self):
        print "\nUse randomFilter to sample the dataset randomly. then slice it"
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"

        h2i.setupImportFolder(None, importFolderPath)
        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=100)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        dataKeyTrain = "rTrain"

        # FIX! too many digits (10) in the 2nd param seems to cause stack trace
        execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

        execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        for trial in range(1,10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r" + str(trial)
            execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
            # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
            h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
            parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            start = time.time()
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + str(trial)
            
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key',None)

            data_key = rfv['data_key']
            kwargs.pop('data_key',None)

            ntree = rfv['ntree']
            kwargs.pop('ntree',None)
            kwargs['iterative_cm'] = 1
            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree,
                timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp
Example #38
    def test_exec2_cbind_fail3(self):

        for i in range(5):
            execExpr = "h <- cbind(c(0,0,0), c(1,1,1))"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            # have to make sure they're created as keys for reuse between execs
            execExpr = "a=c(0,0,0); b=c(0,0,0); d=c(0,0,0); e=c(0,0,0); f=c(0,0,0); g= c(0,0,0);"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "b=a; d=a; f=a; g=a;"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "h <- cbind(a, b)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "h <- cbind(a, b, d)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "h <- cbind(a, b, d, e)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "h <- cbind(a, b, d, e, f)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            execExpr = "h <- cbind(a, b, d, e, f, g)"
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        h2o.check_sandbox_for_errors()
Example #39
    def test_GLM2_covtype_train_predict_all_all(self):
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 1 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 1) # class 1
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        print "Use same data (full) for train and test"
        trainDataKey = "A.hex"
        testDataKey = "A.hex"
        # no 90/10 split here; train and test are the same full dataset
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y+1),
            'max_iter': 20, 
            'n_folds': 0, 
            # 'alpha': 0.1, 
            # 'lambda': 1e-5, 
            'alpha': 0.0,
            'lambda': None,
            'family': 'binomial',
        }
        timeoutSecs = 60

        for trial in range(1):
            # test/train split **********************************************8
            aHack = {'destination_key': trainDataKey}

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            modelKey = glm['glm_model']['_key']
            submodels = glm['glm_model']['submodels']
            # hackery to make it work when there's just one
            validation = submodels[-1]['validation']
            best_threshold = validation['best_threshold']
            thresholds = validation['thresholds']

            # have to look up the index for the cm, from the thresholds list
            best_index = None
            for i,t in enumerate(thresholds):
                if t == best_threshold:
                    best_index = i
                    break
            cms = validation['_cms']
            cm = cms[best_index]
            trainPctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y+1),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertEqual(pctWrong, trainPctWrong,"Should see the same error rate on train and predict? (same data set)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed"
Example #40
    def test_parse_covtype_loop_fvec(self):
        h2o.beta_features = True
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        # don't raise exception if we find something bad in h2o stdout/stderr?
        # h2o.nodes[0].sandboxIgnoreErrors = True
        global OUTSTANDING
        if not OUTSTANDING:
            OUTSTANDING = min(10, len(h2o.nodes))

        if DO_IRIS:
            global DO_BIGFILE
            DO_BIGFILE = False
            bucket = 'smalldata'
            importFolderPath = "iris"
            csvFilename = "iris2.csv"
            csvFilePattern = "iris2.csv"
            if localhost:
                trialMax = 20
            else:
                trialMax = 100
        elif DO_BIGFILE:
            bucket = 'home-0xdiag-datasets'
            importFolderPath = "standard"
            csvFilename = "covtype20x.data"
            csvFilePattern = "covtype20x.data"
            trialMax = 2 * OUTSTANDING
        else:
            bucket = 'home-0xdiag-datasets'
            importFolderPath = "standard"
            csvFilename = "covtype.data"
            csvFilePattern = "covtype.data"
            trialMax = 40 * OUTSTANDING

        # add one just to make it odd
        # OUTSTANDING = min(10, len(h2o.nodes) + 1)
        # don't have more than one source file per node OUTSTANDING? (think of the node increment rule)
    
        # okay to reuse the src_key name. h2o deletes? use unique hex to make sure it's not reused.
        # might go to unique src keys also ..oops have to, to prevent complaints about the key (lock)
        # can't repeatedly import the folder

        # only if not noPoll. otherwise parse isn't done
        # I guess I have to use 'put' so I can name the src key unique, to get overlap
        # I could tell h2o to not delete, but it's nice to get the keys in a new place?
        # maybe rebalance? FIX! todo

        parseTrial = 0
        summaryTrial = 0
        uploader_resultq = multiprocessing.Queue()
        while parseTrial <= trialMax:
            start = time.time()
            uploaders = []
            if not DO_IRIS:
                assert OUTSTANDING<=10 , "we only have 10 links with unique names to covtype.data"
            for o in range(OUTSTANDING):
                src_key = csvFilename + "_" + str(parseTrial) 
                hex_key = csvFilename + "_" + str(parseTrial) + ".hexxx"
                # "key": "hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                # hacked hard ln so source keys would have different names? was getting h2o locking issues
                if DO_IRIS:
                    csvPathname = importFolderPath + "/" + csvFilePattern
                else:
                    csvPathname = importFolderPath + "/" + csvFilePattern + "_" + str(o)
                start = time.time()

                # walk the nodes
                # if this rule is matched for exec/summary below, it should find the name okay? (npe with xorsum)
                # summary2 not seeing it?
                np = parseTrial % len(h2o.nodes)
                retryDelaySecs=5 if DO_BIGFILE else 1
                timeoutSecs=60 if DO_BIGFILE else 15
                tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                    args=(uploader_resultq, uploadit, np, bucket, csvPathname, src_key, hex_key, timeoutSecs, retryDelaySecs))
                tmp.start()
                uploaders.append(tmp)
                parseTrial += 1

            # now sync on them
            for uploader in uploaders:
                try:
                    uploader.join()
                    # don't need him any more
                    uploader.terminate()
                    (importPattern, hex_key) = uploader_resultq.get(timeout=2)
                except KeyboardInterrupt:
                    print 'parent received ctrl-c'
                    for uploader in uploaders:
                        uploader.terminate()
                        uploader.join()
            elapsed = time.time() - start
            print "Parse group end at #", parseTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        print "We might have parses that haven't completed. The join just says we can reuse some files (parse still going)"
        if PARSE_NOPOLL:
            h2o_jobs.pollWaitJobs(timeoutSecs=180)

        h2o_cmd.runStoreView()
        # h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=0.25)

        if DO_PARSE_ALSO: # only if we parsed
            print "These all go to node [0]"
            # getting a NPE if I do xorsum (any exec?) ..just do summary for now..doesn't seem to have the issue
            # suspect it's about the multi-node stuff above
            for summaryTrial in range(trialMax):

                # do last to first..to get race condition?
                firstXorUll = None
                firstQuantileUll = None
                hex_key = csvFilename + "_" + str(summaryTrial) + ".hexxx"
                
                if DO_EXEC_QUANT:
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, thresholds)
                    (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    print "%30s" % "median ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                    if firstQuantileUll:
                        self.assertEqual(ullResult, firstQuantileUll)
                    else:
                        firstQuantileUll = ullResult

                if DO_XORSUM:
                    execExpr = "r2=c(1); r2=xorsum(%s[,1], c(%s));" % (hex_key, thresholds)
                    (resultExec, fpResult) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                    print "%30s" % "xorsum ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                    if firstXorUll:
                        self.assertEqual(ullResult, firstXorUll)
                    else:
                        firstXorUll = ullResult

                if DO_SUMMARY:
                    h2o_cmd.runSummary(key=hex_key)
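
# Hedged sketch, not part of the original test: the fan-out/fan-in shape the parse loop above
# uses, with a dummy worker standing in for uploadit/function_no_keyboard_intr (not shown in
# this snippet) so it runs on its own. Workers push one result each onto a shared Queue; the
# parent drains the queue, then joins.
import multiprocessing
import time

def dummy_worker_sketch(resultq, trial):
    time.sleep(0.1)                       # stand-in for the upload + parse work
    resultq.put(("pattern_%s" % trial, "key_%s.hexxx" % trial))

if __name__ == '__main__':
    OUTSTANDING_SKETCH = 4
    resultq = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=dummy_worker_sketch, args=(resultq, o))
               for o in range(OUTSTANDING_SKETCH)]
    for w in workers:
        w.start()
    # drain before join so a full queue can't block the workers from exiting
    results = [resultq.get(timeout=10) for _ in workers]
    for w in workers:
        w.join()
    for importPattern, hex_key in results:
        print importPattern, hex_key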
Example #41
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # remove the output too! (378)
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    # execExpr="A.hex=%s" % parseResult['destination_key']
                    # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
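
# Hedged sketch, not part of the original test: the ignored_cols string built above turns
# 0-based column indices into the 1-based "C" names these tests use elsewhere (e.g. response
# 'C379' for column 378). Worked example: [3, 4, 540] -> "C4,C5,C541".
def ignored_cols_sketch(zeroBasedCols):
    return ",".join("C" + str(c + 1) for c in zeroBasedCols)

print ignored_cols_sketch([3, 4, 540])   # C4,C5,C541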
Example #42
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (1000000, 5, 'cD', 0, 10, 30),
            (1000000, 5, 'cD', 0, 20, 30),
            (1000000, 5, 'cD', 0, 30, 30),
            (1000000, 5, 'cD', 0, 40, 30),
            (1000000, 5, 'cD', 0, 50, 30),
            (1000000, 5, 'cD', 0, 70, 30),
            (1000000, 5, 'cD', 0, 100, 30),
            (1000000, 5, 'cD', 0, 130, 30),
            (1000000, 5, 'cD', 0, 160, 30),
            # (1000000, 5, 'cD', 0, 320, 30),
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30),
            # (1000000, 5, 'cD', 0, 1280, 30),
        ]

        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt,
             timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt -
                                                                 minInt) + 1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt,
                              SEEDPERFILE)

            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey)

            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0],
                              execExpr,
                              resultKey=resultKey,
                              timeoutSecs=60)

            # do it twice..to get the optimal cached delay for time?
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=60)
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed

            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            groups = execResult['num_rows']
            maxExpectedGroups = ((maxInt - minInt) + 1)**2
            h2o_util.assertApproxEqual(
                groups,
                maxExpectedGroups,
                rel=0.2,
                msg="groups %s isn't close to expected amount %s" %
                (groups, maxExpectedGroups))

            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            # should be same answer in both cases
            execExpr = "d=sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            print "execResult", h2o.dump_json(execResult)
            self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
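
# Hedged sketch, not part of the original test: what the group-count check above is asserting.
# Grouping on the first two columns can produce at most ((maxInt-minInt)+1)**2 distinct groups,
# and ddply returns one row per group, so counting distinct (col1, col2) pairs in plain python
# should land near the num_rows of the a1/a2 results. Assumes the synthetic columns are uniform
# random ints in [minInt, maxInt], which is what write_syn_dataset appears to produce.
import random

def count_groups_sketch(rows):
    return len(set((r[0], r[1]) for r in rows))

minIntSketch, maxIntSketch = 0, 20
rowsSketch = [[random.randint(minIntSketch, maxIntSketch) for _ in range(5)]
              for _ in range(100000)]
print "groups:", count_groups_sketch(rowsSketch), \
    "max expected:", ((maxIntSketch - minIntSketch) + 1) ** 2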
Example #43
    def test_GLM2_covtype_train(self):
        h2o.beta_features = True
        importFolderPath = "standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Parse and Exec************************************************
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=180)

        execExpr="A.hex=%s" % parseResult['destination_key']
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        # use exec to change the output col to binary, case_mode/case_val doesn't work if we use predict
        # will have to live with random extract. will create variance
        # class 4 = 1, everything else 0
        y = 54
        execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y+1, y+1, 4)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        inspect = h2o_cmd.runInspect(key="A.hex")
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        # Split Test/Train************************************************
        # how many rows for each pct?
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        trainDataKey = "rTrain"
        testDataKey = "rTest"
        # start at 90% rows + 1
        
        # GLM, predict, CM*******************************************************8
        kwargs = {
            'response': 'C' + str(y+1),
            'max_iter': 20, 
            'n_folds': 0, 
            'alpha': 0.1, 
            'lambda': 1e-5, 
            'family': 'binomial',
        }
        timeoutSecs = 180

        for trial in range(10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 

            # test/train split **********************************************8
            h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
            aHack = {'destination_key': trainDataKey}
            parseKey = trainDataKey

            # GLM **********************************************8
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            modelKey = glm['glm_model']['_key']

            # Score **********************************************
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=testDataKey,
                model_key=modelKey,
                destination_key=predictKey,
                timeoutSecs=timeoutSecs)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=testDataKey,
                vactual='C' + str(y+1),
                predict=predictKey,
                vpredict='predict',
                )

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm);
            self.assertLess(pctWrong, 8, "Should see less than 8% error (class = 4)")

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"
Example #44
    def test_ddply_plot(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DO_KNOWN_FAIL:
            tryList = [
                (1000000, 5, 'cD', 0, 320, 30), 
            ]
        else:
            tryList = [
                # (1000000, 5, 'cD', 0, 10, 30), 
                (1000000, 5, 'cD', 0, 20, 30), 
                # (1000000, 5, 'cD', 0, 40, 30), 
                (1000000, 5, 'cD', 0, 50, 30), 
                # (1000000, 5, 'cD', 0, 80, 30), 
                (1000000, 5, 'cD', 0, 160, 30), 
                # fails..don't do
                # (1000000, 5, 'cD', 0, 320, 30), 
                # (1000000, 5, 'cD', 0, 320, 30), 
                # starts to fail here. too many groups?
                # (1000000, 5, 'cD', 0, 640, 30), 
                # (1000000, 5, 'cD', 0, 1280, 30), 
                ]

        if DO_APPEND_KNOWN_FAIL2:
            tryList.append(
                (1000000, 5, 'cD', 0, 160, 30), 
            )
            tryList.append(
                (1000000, 5, 'cD', 0, 320, 30), 
            )
        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            if DO_KNOWN_FAIL:
                # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails
                # csvFilename = 'a1' # fails
                csvFilename = "syn_ddply_1Mx5_0_320.gz"
                bucket = "home-0xdiag-datasets"
                csvPathname = "standard/" + csvFilename
                minInt = 0
                maxInt = 320
            else:
                bucket = None
                csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                print "Creating random", csvPathname, "with range", (maxInt-minInt)+1
                write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

            for lll in range(1):
                # PARSE train****************************************
                hexKey = 'r.hex'
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey)
                inspect = h2o_cmd.runInspect(key=hexKey)
                missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename)
                self.assertEqual(missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList)

                for resultKey, execExpr in initList:
                    h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

                #*****************************************************************************************
                # two columns. so worse case every combination of each possible value
                # only true if enough rows (more than the range?)
                maxExpectedGroups = ((maxInt - minInt) + 1) ** 2
                # do it twice..to get the optimal cached delay for time?
                execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                groups = execResult['num_rows']
                # this is a coarse comparison, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(groups, maxExpectedGroups,  rel=0.2, 
                    msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a1dump = h2o_cmd.runInspect(key="a1")
                print "a1", h2o.dump_json(a1dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1")
                self.assertEqual(missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial))

                #*****************************************************************************************

                execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                groups = execResult['num_rows']
                # this is a coarse comparison, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(groups, maxExpectedGroups,  rel=0.2, 
                    msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s" % (groups, maxExpectedGroups, minInt, maxInt))
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a2dump = h2o_cmd.runInspect(key="a2")
                print "a2", h2o.dump_json(a2dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2")
                self.assertEqual(missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial))

                #*****************************************************************************************
                # should be same answer in both cases
                execExpr = "sum(a1!=a2)==0"
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                execExpr = "s=c(0); s=(a1!=a2)"
                (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=120)
                print "execResult", h2o.dump_json(execResult)

                #*****************************************************************************************

                # should never have any NAs in this result
                sdump = h2o_cmd.runInspect(key="s")
                print "s", h2o.dump_json(sdump)
                self.assertEqual(result, 1, "a1 and a2 weren't equal? Maybe ddply varies execution order (fp error?), so repeated ddply() runs can give different answers. %s %s %s" % (PHRASE, result, h2o.dump_json(execResult)))

                # xList.append(ntrees)
                trial += 1
                # this is the biggest it might be; it depends on the random combinations
                # groups = ((maxInt - minInt) + 1) ** 2
                xList.append(groups)
                eList.append(ddplyElapsed)
                fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Exemplo n.º 45
    def test_exec2_multi_node(self):
        h2o.beta_features = True
        for n, node in enumerate(h2o.nodes):
            print "n:", n
            np1 = (n + 1) % len(h2o.nodes)
            np = n % len(h2o.nodes)

            # get this key known to this node
            print "Init with independent targets. No shared target"
            execExpr = "r%s = c(0)" % np1
            print "Sending request to node: %s" % h2o.nodes[np1]
            h2e.exec_expr(node=h2o.nodes[np1],
                          execExpr=execExpr,
                          timeoutSecs=30)

            # test the store expression
            execExpr = "(r%s==0) ? c(0) : c(1)" % np1
            print "Sending request to node: %s" % h2o.nodes[np1]
            h2e.exec_expr(node=h2o.nodes[np1],
                          execExpr=execExpr,
                          timeoutSecs=30)

        global OUTSTANDING
        if not OUTSTANDING:
            OUTSTANDING = min(10, len(h2o.nodes))
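            # cap the number of concurrent exec workers at 10, and never more than the number of nodes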

        execTrial = 0
        worker_resultq = multiprocessing.Queue()
        while execTrial <= TRIALMAX:
            start = time.time()
            workers = []
            for o in range(OUTSTANDING):
                np = execTrial % len(h2o.nodes)
                retryDelaySecs = 5
                timeoutSecs = 60
                bucket = None
                csvPathname = None
                src_key = None
                hex_key = 'a'
                tmp = multiprocessing.Process(
                    target=function_no_keyboard_intr,
                    args=(worker_resultq, execit, np, bucket, csvPathname,
                          src_key, hex_key, timeoutSecs, retryDelaySecs))
                tmp.start()
                workers.append(tmp)
                execTrial += 1

            # Exec doesn't get tracked as a job, so requests can still be outstanding.
            # now sync on the workers
            for worker in workers:
                try:
                    # this should synchronize
                    worker.join()
                    print "worker joined:", worker
                    # don't need him any more
                    worker.terminate()
                    hex_key = worker_resultq.get(timeout=2)
                except KeyboardInterrupt:
                    print 'parent received ctrl-c'
                    for worker in workers:
                        worker.terminate()
                        worker.join()
            elapsed = time.time() - start
            print "Group end at #", execTrial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Exemplo n.º 46
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0,
                                            15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                            -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                   1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                         1.00)),
            (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                          100.0)),
            (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                            7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                             100.00)),
            (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                             75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18,
                                             49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
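            # e.g. for the 1..20000 range: bin width ~1000, so maxDelta ~ 0.5 * 1000 * 1.05 ~ 525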

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used; should match what we expected
            expectedPct = [
                0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
            ]
            # reuse the same list to drive the per-threshold exec quantile comparison below
            thresholds = expectedPct
            pctile = stats['pctile']
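            # with these 11 thresholds, pctile[3] is the 25th, pctile[5] the 50th (median), and pctile[7] the 75th percentile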
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i != 0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                        hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" %
                                    h2o.dump_json(resultExec))
                    h2p.blue_print(
                        "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                        (threshold, result, pt[i]))
                    if not result:
                        raise Exception(
                            "exec result: %s for quantile: %s is bad" %
                            (result, threshold))
                    h2o_util.assertApproxEqual(
                        result,
                        pctile[i],
                        tol=maxDelta,
                        msg=
                        'exec percentile: %s too different from expected: %s' %
                        (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1 == 0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2')
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols, 1)
                    self.assertEqual(numRows, len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                )

            h2o.nodes[0].remove_all_keys()
Exemplo n.º 47
    def test_rapids_vec_fail1(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        # stop if > 1G (fails the memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6), int(100e6), int(10e6)):

            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
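            # rough reading of the Rapids AST: (: #0 #N) is the span 0..N, (c {...}) turns it into a vec, (= !v ...) stores it under key 'v'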

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            execExpr = '(= !v (+ %v %v))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=30)
            elapsed2 = time.time() - start

            if execResult['num_rows']:
                keys.append(execExpr)

            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)

        if 1 == 1:
            xLabel = 'vector length'
            eLabel = 'elapsed (create v)'
            fLabel = 'elapsed (v = v + v)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_basic_with_funs_noinc(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        for i in range(100):
            if i == 0:
                # should never see v as a key from the function?
                execExpr1 = '(= !v1 (c {#0}))'
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr1,
                                                   resultKey='v1',
                                                   timeoutSecs=5)
                execExpr2 = '(= !v2 (cbind %v1 ))'
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr2,
                                                   resultKey='v2',
                                                   timeoutSecs=5)
            else:
                # adding to v shouldn't hurt, but it isn't required because the function output will update it
                # execExpr1 = '(= !v (+ %v #1))'
                # execExpr1 = '(+ %v #1)'
                # add to itself?
                execExpr1 = '(+ %v %v)'
                funs = '[(def anon {v} %s;;;)]' % execExpr1
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   funs,
                                                   resultKey=None,
                                                   timeoutSecs=5,
                                                   doFuns=True)
                # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))'
                # execExpr2 = '(= !v2 (anon %v2))'
                execExpr2 = '(= !v2 (+ %v2 #1))'
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr2,
                                                   resultKey='v2',
                                                   timeoutSecs=15)

            # see if the execExpr had a lhs assign. If so, it better be in the storeview
            r = re.search('![a-zA-Z0-9]+', execExpr2)
            if r:
                lhs = r.group(0)[1:]
                print "Found key lhs assign", lhs

                # FIX! check if v is ever there.

                # KeyIndexeds gets too many rollup stats problems. Don't use for now
                if 1 == 0:
                    inspect = h2o_cmd.runInspect(key=lhs)
                    missingList, labelList, numRows, numCols = infoFromInspect(
                        inspect)

                    storeView = h2o_cmd.runStoreView()
                    print "\nstoreView:", h2o.dump_json(storeView)
                    if lhs not in storeView['keys']:
                        raise Exception("Expected to find %s in %s" %
                                        (lhs, storeView['keys']))
            else:
                print "No key lhs assign"

            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr2)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
Exemplo n.º 49
    def test_exec2_col_add(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if localhost:
            # csvPathname = '1B/reals_100000x1000_15f.data'
            # csvPathname = '1B/reals_1000000x1000_15f.data'
            csvPathname = '1B/reals_1000000x1_15f.data'
            # csvPathname = '1B/reals_1B_15f.data'
            # csvPathname = '1B/reals_100M_15f.data'
        else:
            # csvPathname = '1B/reals_100000x1000_15f.data'
            # csvPathname = '1B/reals_1000000x1000_15f.data'
            csvPathname = '1B/reals_1000000x1_15f.data'
            # csvPathname = '1B/reals_1B_15f.data'
            # csvPathname = '1B/reals_100M_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=3000,
                                       retryDelaySecs=2,
                                       doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        xList = []
        eList = []
        fList = []
        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=300)
        for trial in range(1000):
            for execExpr in exprList:
                # put the trial number into the temp for uniqueness
                execExpr = re.sub('Last.value', 'Last.value%s' % trial,
                                  execExpr)
                execExpr = re.sub(',1', ',%s' % trial, execExpr)
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0],
                                                   execExpr,
                                                   resultKey=None,
                                                   timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'
                c = h2o.nodes[0].get_cloud()
                c = c['nodes']

                # print (h2o.dump_json(c))
                k = [i['num_keys'] for i in c]
                v = [i['value_size_bytes'] for i in c]

                print "keys: %s" % " ".join(map(str, k))
                print "value_size_bytes: %s" % " ".join(map(str, v))

                # print "result:", result
                if ('r1' in execExpr) and (not 'apply' in execExpr):
                    xList.append(trial)
                    eList.append(execTime)
                if ('apply' in execExpr):
                    fList.append(execTime)

        h2o.check_sandbox_for_errors()
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'time: r1[,1] = Last.value = r2'
            fLabel = 'time: apply(r1, 2, sum)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
Exemplo n.º 50
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)
                h2o_cmd.checkKeyDistribution()

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'B.hex=A.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'C.hex=B.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'D.hex=C.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()
Exemplo n.º 51
    def test_GLM2_mnist(self):
        if not SCIPY_INSTALLED:
            pass

        else:
            h2o.beta_features = True
            SYNDATASETS_DIR = h2o.make_syn_dir()

            csvFilelist = [
                (10000, 500, 'cA', 60),
            ]

            trial = 0
            for (rowCount, colCount, hex_key, timeoutSecs) in csvFilelist:
                trialStart = time.time()

                # PARSE test****************************************
                csvFilename = 'syn_' + "binary" + "_" + str(
                    rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # GLM****************************************
                modelKey = 'GLM_model'
                y = colCount
                kwargs = {
                    'response': 'C' + str(y + 1),
                    'family': 'binomial',
                    'lambda': 1e-4,
                    'alpha': 0,
                    'max_iter': 15,
                    'n_folds': 1,
                    'beta_epsilon': 1.0E-4,
                    'destination_key': modelKey,
                }

                # GLM wants the output col to be strictly 0,1 integer
                execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==1" % (
                    hex_key, y + 1, y + 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                aHack = {'destination_key': 'aHack'}

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                lambdaMax = glm['glm_model']['lambda_max']
                print "lambdaMax:", lambdaMax

                best_threshold = glm['glm_model']['submodels'][0][
                    'validation']['best_threshold']
                print "best_threshold", best_threshold

                # pick the middle one?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][5][
                    '_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_val above
                print "\nPredict\n==========\n"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='aHack',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='aHack',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 50, "Should see less than 50% error")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
Exemplo n.º 52
    def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
        # the expected results are only for the shuffled version
        # since getting 10% samples etc of the smallish dataset will vary between
        # shuffled and non-shuffled datasets
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on", csvFilename
        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             key2=key2,
                                             header=0,
                                             timeoutSecs=180)

        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = num_rows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]
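        # e.g. with 1,000,000 rows: pct10=100000 and, after the two fixups above, rowsForPct=[1000000, 100000, 200000, ..., 900000, 1000000]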

        # 0 isn't used
        expectTrainPctRightList = [
            0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
        ]
        expectScorePctRightList = [
            0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
        ]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        # start at 90% rows + 1
        execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] +
                                                                1) + ")"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTest,
                           timeoutSecs=10)

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]

        # don't use the smaller samples..bad error rates, plus for sorted covtype, you can get just one class!
        for trial in range(8, 9):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r_" + csvFilename + "_" + str(trial)
            execExpr = resultKey + " = slice(" + key2 + ",1," + str(
                rowsToUse) + ")"
            h2o_exec.exec_expr(None,
                               execExpr,
                               resultKey=resultKey,
                               timeoutSecs=10)
            # hack so the RF will use the sliced result
            # FIX! don't use the sliced bit..use the whole data for rf training below
            ### parseKey['destination_key'] = resultKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntree'] * 20
            # do oobe
            kwargs['out_of_bag_error_estimate'] = 1
            kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
            # kwargs['model_key'] = "model"
            # double check the rows/cols
            inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, "going into RF")

            start = time.time()
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            oobeTrainPctRight = 100 * (
                1.0 - rfv['confusion_matrix']['classification_error'])
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            # pop the stuff from kwargs that were passing as params
            model_key = rfv['model_key']
            kwargs.pop('model_key', None)

            data_key = rfv['data_key']
            kwargs.pop('data_key', None)

            ntree = rfv['ntree']
            kwargs.pop('ntree', None)

            kwargs['iterative_cm'] = 1
            kwargs['no_confusion_matrix'] = 0

            # do full scoring
            kwargs['out_of_bag_error_estimate'] = 0
            # double check the rows/cols
            inspect = h2o_cmd.runInspect(key=dataKeyTest)
            h2o_cmd.infoFromInspect(inspect, "dataKeyTest")

            rfvScoring = h2o_cmd.runRFView(None,
                                           dataKeyTest,
                                           model_key,
                                           ntree,
                                           timeoutSecs,
                                           retryDelaySecs=1,
                                           print_params=True,
                                           **kwargs)

            h2o.nodes[0].generate_predictions(model_key=model_key,
                                              data_key=dataKeyTest)

            fullScorePctRight = 100 * (
                1.0 - rfvScoring['confusion_matrix']['classification_error'])

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / num_rows), "pct. of all rows"

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectScorePctRightList, actualScorePctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        # return the last rfv done during training
        return rfv
Exemplo n.º 53
    def test_exec2_xorsum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax,
                 expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(
                    None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum,
                 expectedFpSum) = write_syn_dataset(csvPathname, rowCount,
                                                    colCount, expectedMin,
                                                    expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(
                    expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(
                    expectedFpSum)
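                # (assumed) these h2o_util helpers just reinterpret the raw 64-bit pattern,
                # roughly struct.unpack('<d', struct.pack('<Q', ull))[0] and the reverse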

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='local',
                                               hex_key=hex_key,
                                               timeoutSecs=3000,
                                               retryDelaySecs=2)
                inspect = h2o_cmd.runInspect(key=hex_key)
                print "numRows:", inspect['numRows']
                print "numCols:", inspect['numCols']
                inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
                print "inspect offset = -1:", h2o.dump_json(inspect)

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
                        start = time.time()
                        (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0],
                                                               execExpr,
                                                               resultKey=None,
                                                               timeoutSecs=300)
                        print r, 'exec took', time.time() - start, 'seconds'
                        print r, "execResult:", h2o.dump_json(execResult)
                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                            ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                            expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                            expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult != expectedUllSum:
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (
                                ullResult, expectedUllSum)
                            raise Exception(
                                "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x"
                                % (ullResult, expectedUllSum))

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (
                        ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (
                    expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (
                    expectedFpSumAsLongLong, expectedFpSum)
Exemplo n.º 54
        def predict_and_compare_csvs(model_key,
                                     hex_key,
                                     predictHexKey,
                                     translate=None,
                                     y=0):
            # have to slice out col 0 (the output) and feed result to predict
            # cols are 0:784 (1 output plus 784 input features)
            # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
            dataKey = "P.hex"
            if skipSrcOutputHeader:
                print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
                print "hack for now, can't chop out col 0 in Exec currently"
                dataKey = hex_key
            else:
                print "No header in dataset, can't chop out cols, since col numbers are used for names"
                dataKey = hex_key

            # +1 col index because R-like
            # FIX! apparently we lose the enum mapping when we slice out, and then csv download? we just get the number?
            # OH NO..it looks like we actually preserve the enum..it's in the csv downloaded
            # the prediction is the one that doesn't have it, because it's related to clusters, which have no
            # notion of output classes
            h2e.exec_expr(execExpr="Z.hex=" + hex_key + "[," + str(y + 1) +
                          "]",
                          timeoutSecs=30)

            start = time.time()
            predictResult = h2o.nodes[0].generate_predictions(
                model_key=model_key,
                data_key=hex_key,
                destination_key=predictHexKey)
            print "generate_predictions end on ", hex_key, " took", time.time(
            ) - start, 'seconds'
            print "predictResult:", h2o.dump_json(predictResult)

            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key="Z.hex",
                                      csvPathname=csvSrcOutputPathname)
            h2o.nodes[0].csv_download(src_key=predictHexKey,
                                      csvPathname=csvPredictPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            (rowNum1, originalOutput) = compare_csv_at_one_col(
                csvSrcOutputPathname,
                msg="Original",
                colIndex=0,
                translate=translate,
                skipHeader=skipSrcOutputHeader)
            (rowNum2, predictOutput) = compare_csv_at_one_col(
                csvPredictPathname,
                msg="Predicted",
                colIndex=0,
                skipHeader=skipPredictHeader)

            # no header on source
            if ((rowNum1 - skipSrcOutputHeader) !=
                (rowNum2 - skipPredictHeader)):
                raise Exception(
                    "original rowNum1: %s - %d not same as downloaded predict: rowNum2: %s - %d \
                    %s" %
                    (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

            wrong = 0
            for rowNum, (o, p) in enumerate(zip(originalOutput,
                                                predictOutput)):
                # if float(o)!=float(p):
                if str(o) != str(p):
                    if wrong == 10:
                        print "Not printing any more mismatches\n"
                    elif wrong < 10:
                        msg = "Comparing original output col vs predicted. row %s differs. \
                            original: %s predicted: %s" % (rowNum, o, p)
                        print msg
                    wrong += 1

            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong) / len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # I looked at what h2o can do for modelling with binomial and it should get better than 25% error?

            # hack..need to fix this
            if 1 == 0:
                if pctWrong > 2.0:
                    raise Exception(
                        "pctWrong too high. Expect < 2% error because it's reusing training data"
                    )
            return pctWrong
Exemplo n.º 55
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        if len(h2o.nodes) == 1:
            csvFilenameList = [
                ("*[1][0][0-9].dat", "file_10_A.dat", 10 * avgMichalSize, 600),
            ]
        else:
            csvFilenameList = [
                ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize,
                 1800),
                # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename +
                                     " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs,
                                           benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # don't include the output column
                # remove the output too! (378)
                ignore_x = []
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541
                ]:
                    x.remove(i)
                    ignore_x.append(i)

                # plus 1 because we are no longer 0 offset
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'family': 'binomial',
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('C379', 'C379', 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
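                # C379 becomes 1 where the original value was > 15 and 0 otherwise, i.e. a binomial response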
                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Exemplo n.º 56
    def test_GLM2_mnist(self):
        h2o.beta_features = True
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema=schema,
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs)

            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTestResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTestResult['destination_key'],
                timeoutSecs=300,
                forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema=schema,
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseTrainResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(
                y,
                key=parseTrainResult['destination_key'],
                timeoutSecs=300,
                forRF=True)
            print "ignoreX:", ignoreX

            modelKey = 'GLM_model'
            params = {
                'ignored_cols': ignoreX,
                'response': 'C' + str(y + 1),
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 15,
                ## 'thresholds': 0.5,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey,
            }

            if DO_ALL_DIGITS:
                cases = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                # kwargs['case_val'] = c

                # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
                if DO_BUG:
                    execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (
                        trainKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "A.hex=%s" % (trainKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                if DO_BUG:
                    execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (
                        testKey, y + 1, y + 1, c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                else:
                    execExpr = "B.hex=%s" % (testKey)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                    execExpr = "B.hex[,%s]=(B.hex[,%s]==%s)" % (y + 1, y + 1,
                                                                c)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glmFirstResult = h2o_cmd.runGLM(parseResult=aHack,
                                                timeoutSecs=timeoutSecs,
                                                pollTimeoutSecs=60,
                                                noPoll=True,
                                                **kwargs)
                print "\nglmFirstResult:", h2o.dump_json(glmFirstResult)
                job_key = glmFirstResult['job_key']
                h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs,
                                            pollTimeoutSecs=60,
                                            retryDelaySecs=5)

                # double check...how come the model is bogus?
                h2o_jobs.pollWaitJobs()
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)

                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                modelKey = glm['glm_model']['_key']

                # This seems wrong..what's the format of the cm?
                cm = glm['glm_model']['submodels'][0]['validation']['_cms'][
                    -1]['_arr']
                print "cm:", cm
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # Score *******************************
                # this messes up if you use case_mode/case_vale above
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key='B.hex',
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual='B.hex',
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(pctWrong, 9,
                                "Should see less than 9% error (class = 4)")

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
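
The assertion above relies on h2o_gbm.pp_cm_summary() to reduce the confusion matrix to a percent-wrong figure. As a rough illustration only (not the library's implementation), a confusion matrix given as a list of rows, with actual classes on the rows and predicted classes on the columns and no totals row or column, can be summarized like this:

def pct_wrong_from_cm(cm):
    # Illustrative sketch: percent of off-diagonal (misclassified) counts
    # in a square confusion matrix given as a list of lists.
    total = sum(sum(row) for row in cm)
    correct = sum(cm[i][i] for i in range(len(cm)))
    if total == 0:
        return 0.0
    return 100.0 * (total - correct) / total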
Exemplo n.º 57
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000
        avgMichalSize = 116561140
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize = 183
        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/10k_small_gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("00[0-4][0-9]_syn.csv.gz", "file_50.dat.gz", 50 * synSize, 700
                 ),
                ("[1][1][0-9][0-9]_.*", "file_100.dat.gz", 100 * synSize, 700),
                ("[1][0-4][0-9][0-9]_.*", "file_500.dat.gz", 500 * synSize,
                 700),
                ("[1][0-9][0-9][0-9]_.*", "file_1000.dat.gz", 1000 * synSize,
                 700),
                ("[0-4][0-9][0-9][0-9]_.*", "file_5000.dat.gz", 5000 * synSize,
                 700),
                ("[0-9][0-9][0-9][0-9]_.*", "file_10000.dat.gz",
                 10000 * synSize, 700),
            ]

        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800),
                ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz",
                 100 * avgMichalSize, 1800),
                ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz",
                 200 * avgMichalSize, 1800),
                ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz",
                 200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
            ]

        if 1 == 1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600),
                # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600),
                # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800),
                #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz",
                 100 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz",
                 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz",
                 120 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz",
                 140 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz",
                 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz",
                 160 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz",
                 180 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz",
                 200 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
                ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz",
                 400 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/more1_300_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz",
                 300 * avgMichalSize, 3600),
                ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz",
                 50 * avgMichalSize, 3600),
            ]

        if 1 == 0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat",
                 100 * avgMichalSizeUncompressed, 700),
                # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
                # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ]
        if 1 == 0:
            importFolderPath = '/home/0xdiag/datasets'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz",
                 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz",
                 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz",
                 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz",
                 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz",
                 "file_100.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz",
                 "file_200.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz",
                 "file_300.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz",
                 100 * avgMichalSize, 1200),
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
            ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # split out the pattern match and the filename used for the hex
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
        jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
        jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
              ' -Dcom.sun.management.jmxremote.authenticate=false' + \
              ' -Dcom.sun.management.jmxremote.ssl=false'  + \
              ' -Dcom.sun.management.jmxremote' + \
              ' -Dcom.sun.management.jmxremote.local.only=false'
        jea = ' -Dlog.printAll=true'

        for i, (csvFilepattern, csvFilename, totalBytes,
                timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(
                    2,
                    java_heap_GB=tryHeap,
                    base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(
                    1,
                    java_heap_GB=tryHeap / 2,
                    base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(
                    None, importFolderPath)
                importFullList = importFolderResult['succeeded']
                importFailList = importFolderResult['failed']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                    importFailList)
                # creates csvFilename.hex from file in importFolder dir

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message(
                    "Parse " + csvFilename +
                    " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(
                    None,
                    csvFilepattern,
                    importFolderPath,
                    key2=csvFilename + ".hex",
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    if (i + 1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2,
                         timeoutSecs) = csvFilenameList[i + 1]
                        parseKey = h2i.parseImportFolderFile(
                            None,
                            csvFilepattern,
                            importFolderPath,
                            key2=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i + 2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3,
                         timeoutSecs) = csvFilenameList[i + 2]
                        parseKey = h2i.parseImportFolderFile(
                            None,
                            csvFilepattern,
                            importFolderPath,
                            key2=csvFilename + ".hex",
                            timeoutSecs=timeoutSecs,
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                                          timeoutSecs=timeoutSecs,
                                          benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                if totalBytes is not None:
                    fileMBS = (totalBytes / 1e6) / elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseKey['response'][
                    'time']
                print "Parse result['destination_key']:", parseKey[
                    'destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.check_enums_from_inspect(parseKey)

                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)'
                execExpr = 'a = slice(' + origKey + ',1,200)'
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # don't include the output column
                    # remove the output too! (378)
                    for i in [
                            3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19,
                            20, 424, 425, 426, 540, 541, 378
                    ]:
                        x.remove(i)
                    x = ",".join(map(str, x))

                    GLMkwargs = {
                        'x': x,
                        'y': 378,
                        'case': 15,
                        'case_mode': '>',
                        'max_iter': 10,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                             timeoutSecs=timeoutSecs,
                                             **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), tryHeap, csvFilepattern, csvFilename,
                        elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.check_key_distribution()
                h2o_cmd.delete_csv_key(csvFilename, importFullList)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush()
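
The benchmark line above computes parse throughput as (totalBytes / 1e6) / elapsed, using the byte estimate carried in each csvFilenameList tuple. A small sketch of that calculation, reusing the same format string for the perf-log message, is below (the helper name and parameters are illustrative):

def parse_throughput_msg(numNodes, heapGB, pattern, filename, totalBytes, elapsed):
    # Sketch: MB/sec based on the (estimated) total bytes parsed, formatted
    # the same way as the cloudPerfH2O message emitted above.
    fileMBS = (totalBytes / 1e6) / elapsed
    return '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
        numNodes, heapGB, pattern, filename, fileMBS, elapsed)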
Exemplo n.º 58
    def test_parse_200k_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 200, 200),
            (10, 1000, 'cB', 200, 200),
            (10, 1000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # import it N times and compare the N hex keys
            REPEAT = 5
            for i in range(REPEAT):
                hex_key_i = hex_key + "_" + str(i)

                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key_i,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
                print "Parse:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"

                # We should be able to see the parse result?
                start = time.time()
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             timeoutSecs=timeoutSecs2)
                print "Inspect:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"
                h2o_cmd.infoFromInspect(inspect, csvPathname)
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(inspect['numRows']), \
                    "    numCols:", "{:,}".format(inspect['numCols'])

                # should match # of cols in header or ??
                self.assertEqual(
                    inspect['numCols'], colCount,
                    "parse created result with the wrong number of cols %s %s"
                    % (inspect['numCols'], colCount))
                self.assertEqual(inspect['numRows'], rowCount,
                    "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                    (inspect['numRows'], rowCount))

            # compare each to 0
            for i in range(1, REPEAT):
                hex_key_i = hex_key + "_" + str(i)
                hex_key_0 = hex_key + "_0"

                print "\nComparing %s to %s" % (hex_key_i, hex_key_0)
                if 1 == 0:
                    execExpr = "%s[1,]+%s[1,]" % (hex_key_0, hex_key_i)
                    resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                       timeoutSecs=30)
                    execExpr = "%s[,1]+%s[,1]" % (hex_key_0, hex_key_i)
                    resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                       timeoutSecs=30)

                execExpr = "%s+%s" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "%s!=%s" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "%s==%s" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "sum(%s==%s)" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "s=sum(%s==%s)" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)

                execExpr = "s=c(1); s=sum(%s==%s)" % (hex_key_0, hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "n=c(1); n=nrow(%s)*ncol(%s))" % (hex_key,
                                                             hex_key_i)
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                execExpr = "r=c(1); r=s==n"
                resultExec, result = h2e.exec_expr(execExpr=execExpr,
                                                   timeoutSecs=30)
                print "result:", result
Exemplo n.º 59
    def test_four_billion_rows_fvec(self):
        timeoutSecs = 1500

        importFolderPath = "billions"
        csvFilenameList = [
            "four_billion_rows.csv",
        ]
        for csvFilename in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            start = time.time()

            # Parse*********************************
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=180,
                                           retryDelaySecs=3)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            # Inspect*********************************
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            byteSize = inspect['byteSize']
            print "\n" + csvFilename, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols), \
                "    byteSize:", "{:,}".format(byteSize)

            expectedRowSize = numCols * 1  # plus output
            # expectedValueSize = expectedRowSize * numRows
            expectedValueSize = 8001271520
            self.assertEqual(byteSize, expectedValueSize,
                msg='byteSize %s is not expected: %s' % \
                (byteSize, expectedValueSize))

            summaryResult = h2o_cmd.runSummary(
                key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            self.assertEqual(
                2,
                numCols,
                msg="generated %s cols (including output).  parsed to %s cols"
                % (2, numCols))
            self.assertEqual(4 * 1000000000,
                             numRows,
                             msg="generated %s rows, parsed to %s rows" %
                             (4 * 1000000000, numRows))

            # KMeans*********************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'max_iter': 10,
                'normalize': 0,
                'destination_key': 'junk.hex',
                'seed': 265211114317615310,
            }

            timeoutSecs = 900
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                       timeoutSecs=timeoutSecs,
                                       retryDelaySecs=4,
                                       **kwargs)

            # GLM*********************************
            print "\n" + csvFilename
            kwargs = {
                'response': 'C1',
                'n_folds': 0,
                'family': 'binomial',
            }
            # one coefficient is checked a little more
            colX = 1

            # convert to binomial
            execExpr = "A.hex=%s" % parseResult['destination_key']
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            execExpr = "A.hex[,%s]=(A.hex[,%s]==%s)" % ('1', '1', 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            aHack = {'destination_key': "A.hex"}

            # L2
            timeoutSecs = 900
            kwargs.update({'alpha': 0, 'lambda': 0})
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=aHack,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            elapsed = time.time() - start
            print "glm (L2) end on ", csvFilename, 'took', elapsed, 'seconds.', "%d pct. of timeout" % (
                (elapsed / timeoutSecs) * 100)
            h2o_glm.simpleCheckGLM(self, glm, "C" + str(colX), **kwargs)
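
The byte-size assertion above hard-codes the measured compressed size; the commented-out lines hint at the rough estimate it replaces (about one byte per value for this two-column dataset). A hedged sketch of that estimate, useful only as a lower-bound sanity check rather than the exact equality the test performs, might look like:

def rough_expected_bytes(numRows, numCols, bytesPerValue=1):
    # Sketch: lower-bound estimate following the commented-out
    # expectedRowSize * numRows calculation above. The byteSize reported by
    # Inspect also includes per-chunk overhead, so an exact check (as in the
    # test) still needs the measured constant.
    return numRows * numCols * bytesPerValue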
    def test_GLM2_mnist_reals(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path="mnist/" + testCsvFilename,
                                           schema='put',
                                           hex_key=testKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is the digit label
            print "y:", y
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path="mnist/" + trainCsvFilename,
                                           schema='put',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x GLM will use"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)
            print "x:", x

            modelKey = "mnist"
            params = {
                'response': y,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 10,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                'destination_key': modelKey
            }

            for c in [5]:
                print "Trying binomial with case:", c
                execExpr = "A.hex=%s;A.hex[,%s]=(A.hex[,%s]==%s)" % (
                    trainKey, y + 1, y + 1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                kwargs = params.copy()

                timeoutSecs = 1800
                start = time.time()
                aHack = {'destination_key': 'A.hex'}
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=60,
                                     **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)

                # Score **********************************************
                execExpr = "B.hex=%s;B.hex[,%s]=(B.hex[,%s]==%s)" % (
                    testKey, y + 1, y + 1, c)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                print "Problems with test data having different enums than train? just use train for now"
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key="B.hex",
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual="B.hex",
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                # self.assertLess(pctWrong, 8,"Should see less than 7 pct error (class = 4): %s" % pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)
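
The scoring pattern that closes both MNIST tests (runPredict into Predict.hex, then predict_confusion_matrix against the binomial-converted test frame) could be factored into a helper along these lines. The h2o_cmd and node APIs are assumed to behave exactly as in the calls above; the helper name, parameters, and the idea of returning just the cm are illustrative:

def score_binomial(node, testKey, modelKey, responseCol, timeoutSecs=1800):
    # Sketch: predict on the (already binomial-converted) test frame, then
    # pull a confusion matrix comparing the actual response to the prediction.
    predictKey = 'Predict.hex'
    h2o_cmd.runPredict(data_key=testKey, model_key=modelKey,
                       destination_key=predictKey, timeoutSecs=timeoutSecs)
    predictCMResult = node.predict_confusion_matrix(
        actual=testKey,
        vactual=responseCol,
        predict=predictKey,
        vpredict='predict',
    )
    return predictCMResult['cm']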